Source code for preprocessing_operations
from sklearn.feature_extraction.text import CountVectorizer
import os
import pickle
import logging
def createVoabularyFeatures(lstRepos):
    """
    Creates the vocabulary list from the given list of GithubRepo objects.

    :param lstRepos: list of GithubRepo objects
    :return: vocabList - list of the feature names
    """
    lstRepoStringInfo = []
    for tmpRepo in lstRepos:
        # collect the filtered readme and the filtered description of every repository
        lstRepoStringInfo.append(tmpRepo.getFilteredReadme(bApplyStemmer=True, bCheckStopWords=True))
        lstRepoStringInfo.append(tmpRepo.getFilteredRepoDescription(bApplyStemmer=True, bCheckStopWords=True))

    lstBannedWordsAddition = ['git', 'repositori', 'github', 'new', 'us', 'use', 'high', 'nasa',
                              'present', 'open', 'public', 'http', 'www', 'com']

    # create a counter which counts the occurrence of each vocabulary word;
    # the vocabulary consists of every word that occurs in at least min_df
    # documents (here: 5), minus the banned stop words above
    vectorizer = CountVectorizer(min_df=5, stop_words=lstBannedWordsAddition)

    # fit_transform returns a sparse matrix:
    # each column is mapped to a specific feature (see lstFeatureNames) and
    # each value holds the occurrence count of that word in the current document
    matSparse = vectorizer.fit_transform(lstRepoStringInfo)
    lstFeatureNames = vectorizer.get_feature_names()
    return lstFeatureNames
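
# For illustration, a minimal, self-contained sketch of the vectorizer step
# above, run on made-up strings instead of the filtered GithubRepo texts.
# The toy corpus and min_df=1 are assumptions chosen so the tiny example
# yields output; the function above uses min_df=5 on real repository texts.
#
#     from sklearn.feature_extraction.text import CountVectorizer
#
#     # toy corpus standing in for the filtered readme/description strings
#     lstToyDocs = [
#         'python tool for parsing github readme files',
#         'parsing library with python bindings',
#         'tool for readme parsing written in python',
#     ]
#     # min_df=1 because the toy corpus is tiny; 'github' is banned as above
#     vectorizer = CountVectorizer(min_df=1, stop_words=['github'])
#     matSparse = vectorizer.fit_transform(lstToyDocs)
#     print(vectorizer.get_feature_names())  # the learned vocabulary
#     print(matSparse.toarray())             # per-document word counts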

def readVocabFromFile(strVocabPath):
    """
    Reads the stored vocabulary list from a given file path.

    :param strVocabPath: path where the vocabulary is stored
    :return: lstVoc - the vocabulary list loaded from the file
    """
    # http://stackoverflow.com/questions/899103/writing-a-list-to-a-file-with-python
    # read the pickled dump file
    with open(strVocabPath, 'rb') as fp:
        logging.debug('open vocab from file...')
        lstVoc = pickle.load(fp)
    return lstVoc
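
# As a usage note, a hypothetical writeVocabToFile counterpart (not part of
# this module; the name and the plain-pickle format are assumptions inferred
# from how readVocabFromFile loads the file) could look like:
#
#     import pickle
#
#     def writeVocabToFile(strVocabPath, lstVoc):
#         # dump the vocabulary list in the format readVocabFromFile expects
#         with open(strVocabPath, 'wb') as fp:
#             pickle.dump(lstVoc, fp)
#
#     # round trip:
#     # writeVocabToFile('vocab.dump', createVoabularyFeatures(lstRepos))
#     # lstVoc = readVocabFromFile('vocab.dump')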