Source code for string_operation

from bs4 import BeautifulSoup
import re
import nltk
import mimetypes
from nltk.corpus import stopwords   # Import the stop word list
from nltk.stem.porter import PorterStemmer

# refine the input string
def prepare_words(raw_text, bApplyStemmer=True, bCheckStopWords=False):
    """
    Normalise raw text into lower-case words for comparison with the vocab list.

    Processing order: lines starting with an http(s) URL are dropped,
    backslash escape sequences and parenthesised passages are blanked out,
    every character that is not an ASCII letter becomes a space, and the
    remainder is lower-cased and split on whitespace.

    :param raw_text: text that may contain URLs, escaped control sequences, numbers, ...
    :param bApplyStemmer: if True, Porter-stem every word, discard stems of
        two characters or fewer and join the rest with single spaces
    :param bCheckStopWords: if True, remove English and German stop words
    :return: a space-separated string when ``bApplyStemmer`` is True,
        otherwise a list of words
    """

    # Remove web addresses. `https?` covers both schemes; the previous
    # `http?` only matched "htt"/"http" and therefore never removed https URLs.
    # Only URLs at the start of a line are matched, and the rest of that line
    # (plus trailing line breaks) is removed with them.
    raw_text = re.sub(r'^https?://.*[\r\n]*', '', raw_text, flags=re.MULTILINE)

    # Blank out escaped control sequences that appear literally in the text,
    # i.e. a backslash followed by one character ("\n", "\t", ...).
    # http://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python
    raw_text = re.sub(r'\\.', ' ', raw_text)

    # Drop parenthesised passages (innermost pair only; nesting is not unwound).
    raw_text = re.sub(r'\([^()]*\)', ' ', raw_text)

    # Replace everything that is not an ASCII letter with a space.
    letters = re.sub("[^a-zA-Z]", " ", raw_text)

    words = letters.lower().split()

    if bCheckStopWords:
        # Build the stop-word set once instead of re-reading both word lists
        # for every single word.
        stop_words = set(stopwords.words("english")) | set(stopwords.words("german"))
        words = [w for w in words if w not in stop_words]

    if bApplyStemmer:
        # see: http://www.nltk.org/howto/stem.html for more details
        stemmer = PorterStemmer()
        stems = (stemmer.stem(word) for word in words)
        # Only keep stems longer than two characters.
        words = " ".join(stem for stem in stems if len(stem) > 2)

    # NOTE: string (space separated) when stemming was applied, list otherwise.
    return words


def validate_url(url_in):
    """
    Perform simple string checks to decide whether a URL can be processed further.

    The input is rejected when it is empty, does not start with ``https://``
    or is not a GitHub URL. All three conditions are covered by requiring the
    ``https://github.com/`` prefix. Input that is not a string is rejected too.

    :param url_in: the URL to perform the checks on
    :return: True if the URL is accepted, False otherwise
    """
    return isinstance(url_in, str) and url_in.startswith("https://github.com/")

def validate_txtfile(path):
    """
    Check whether a path looks like a plain-text file.

    The decision is made by ``mimetypes.guess_type`` from the path alone.

    :param path: path to file
    :return: True if the guessed MIME type is ``text/plain``, False otherwise
    """
    # guess_type returns (type, encoding); only the type is relevant here.
    return mimetypes.guess_type(path)[0] == 'text/plain'