Source code for string_operation
from bs4 import BeautifulSoup
import re
import nltk
import mimetypes
from nltk.corpus import stopwords # Import the stop word list
from nltk.stem.porter import PorterStemmer
# refine the input string
def prepare_words(raw_text, bApplyStemmer=True, bCheckStopWords=False):
    """
    Prepares the words of a raw text for the comparison with the vocab list

    :param raw_text: text that may contain web addresses, escape sequences, numbers, punctuation, ...
    :param bApplyStemmer: true if stemming shall be applied
    :param bCheckStopWords: true if English and German stopwords shall be removed
    :return: with stemming: the stemmed words longer than 2 characters, joined
             into one space-separated string; without stemming: the list of
             lower-cased words
    """
    # remove web addresses: every line that starts with http:// or https:// is dropped
    # (the pattern used to be 'http?', which made the second 't' optional and never matched https)
    raw_text = re.sub(r'^https?:\/\/.*[\r\n]*', '', raw_text, flags=re.MULTILINE)
    # blank out escape sequences that are written out in the text, i.e. a literal
    # backslash followed by one character (such as the two characters "\n")
    # http://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python
    raw_text = re.sub(r'\\.', ' ', raw_text)
    # blank out parenthesised groups that contain no nested parentheses
    raw_text = re.sub(r'\([^()]*\)', ' ', raw_text)
    letters = re.sub("[^a-zA-Z]", " ", raw_text)  # remove everything that isn't a letter
    words = letters.lower().split()  # write words into a list

    if bCheckStopWords:
        # build the lookup set once instead of re-reading both stopword lists for every word
        filler_words = set(stopwords.words("english")) | set(stopwords.words("german"))
        words = [w for w in words if w not in filler_words]  # remove "filler" words

    if bApplyStemmer:
        # see: http://www.nltk.org/howto/stem.html for more details
        stemmer = PorterStemmer()
        singles = [stemmer.stem(word) for word in words]
        # only keep stems with a length higher than 2
        singles = [single for single in singles if len(single) > 2]
        words = " ".join(singles)  # the stems as one string, separator: space

    return words
def validate_url(url_in):
    """
    Performs some simple string checks to validate the URL for further processing

    The input is accepted only if it is a string that starts with
    "https://github.com/". Empty input, input that does not use https and
    URLs of other hosts are all rejected by that single prefix check.

    :param url_in: The URL to perform the checks on
    :return: True if the input is a GitHub https URL, otherwise False
    """
    # Anything that is not a string (e.g. None) cannot be a valid URL; reject it
    # here instead of failing with an AttributeError on .startswith() below.
    if not isinstance(url_in, str):
        return False
    return url_in.startswith("https://github.com/")
def validate_txtfile(path):
    """
    Tells whether the MIME type guessed for the given path is plain text

    :param path: path to file
    :return: True if the guessed type is 'text/plain', otherwise False
    """
    guessed_type, _encoding = mimetypes.guess_type(path)
    return guessed_type == 'text/plain'