Source code for preprocessing_operations

from sklearn.feature_extraction.text import CountVectorizer
import os
import pickle
import logging

def createVoabularyFeatures(lstRepos):
    """
    Create the vocabulary list from the given list of GithubRepo-Objects.

    For every repository two text documents are collected: the filtered
    readme and the filtered repository description (both requested with
    bApplyStemmer=True and bCheckStopWords=True). A CountVectorizer is then
    fitted on all collected documents and its learned feature names are
    returned.

    :param lstRepos: list of GithubRepo-Objects; each object must provide
                     getFilteredReadme() and getFilteredRepoDescription()
    :return: vocabList - list of the feature names
    """
    lstRepoStringInfo = []

    for tmpRepo in lstRepos:
        # readme and description are added as two separate documents
        lstRepoStringInfo.append(tmpRepo.getFilteredReadme(bApplyStemmer=True, bCheckStopWords=True))
        lstRepoStringInfo.append(tmpRepo.getFilteredRepoDescription(bApplyStemmer=True, bCheckStopWords=True))

    # additional words which are excluded from the vocabulary
    # (entries such as 'repositori' look like stemmed forms - presumably to
    # match the stemmed input; verify against the GithubRepo filter)
    lstBannedWordsAddition = ['git', 'repositori', 'github', 'new', 'us', 'use', 'high', 'nasa',
                              'present', 'open', 'public', 'http', 'www', 'com']

    # by default the vocabulary consists of all words; min_df=5 keeps only
    # words which occur in at least 5 of the collected documents
    vectorizer = CountVectorizer(min_df=5, stop_words=lstBannedWordsAddition)

    # only the learned vocabulary is needed here, so the sparse count matrix
    # which fit_transform() would return is not computed into a local
    vectorizer.fit(lstRepoStringInfo)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; get_feature_names_out() is its replacement and returns an ndarray,
    # which is converted so callers still receive a plain list
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())

    # fallback for scikit-learn versions older than 1.0
    return vectorizer.get_feature_names()
def readVocabFromFile(strVocabPath):
    """
    Load a previously pickled vocab list from disk.

    :param strVocabPath: path of the pickle file which holds the vocab
    :return: the object which was unpickled from the file
    """
    # NOTE(review): pickle executes arbitrary code while loading; this
    # presumably only ever reads a self-written cache file - confirm that
    # the path can never point to untrusted data.
    with open(strVocabPath, 'rb') as vocab_file:
        logging.debug('open vocab from file...')
        return pickle.load(vocab_file)
def initInputParameters(strVocabPath, lstGithubRepo):
    """
    Initialise the vocabulary list, using the file at strVocabPath as cache.

    If strVocabPath is an existing file, the vocab is read from it and
    lstGithubRepo is not used. Otherwise the vocab is created from the
    repositories and dumped to strVocabPath.

    :param strVocabPath: path where the vocab list is stored
    :param lstGithubRepo: list of the githubRepository-objects
    :return: the vocab list (read from file or freshly created)
    """
    # an already stored vocab wins over re-creating it
    if os.path.isfile(strVocabPath):
        return readVocabFromFile(strVocabPath)

    vocabulary = createVoabularyFeatures(lstGithubRepo)

    # persist the fresh vocab so that the next call can read it from disk
    with open(strVocabPath, 'wb') as dump_file:
        logging.debug('dump vocab to file...')
        pickle.dump(vocabulary, dump_file)

    return vocabulary