import datetime
import os
from os import path
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from .definitions.githubLanguages import lstLanguages
from .features.learning_features import IntFeatures
from .utility_funcs import string_operation
from .utility_funcs.io_agent import InputOutputAgent
from pathlib import Path
def auto_str(cls):
    """
    Class decorator that installs an auto-generated ``__str__`` method.

    The generated string starts with the class name, followed in parentheses
    by one ``name=value`` entry per instance attribute (taken from
    ``vars(self)``); entries are separated by a newline and a single space.

    :param cls: class to decorate
    :return: the same class object, with ``__str__`` replaced
    """
    def __str__(self):
        return '%s(%s)' % (
            type(self).__name__,
            '\n '.join('%s=%s' % item for item in vars(self).items())
        )
    cls.__str__ = __str__
    return cls
class GithubRepo:
    """
    Representation of a single Github repository.

    Wraps the json-Api data and the readme of a repository (both obtained
    through an ``InputOutputAgent``) and derives features from them.
    """

    def __init__(self, strUser, strName):
        """
        Constructor which takes two arguments to initialize the repository

        :param strUser: user of the repository (e.g. "GNOME")
        :param strName: name of the repository (e.g. "gimp")
        :raises ConnectionError: propagated unchanged if raised while loading the json data
        """
        self.user = strUser
        self.name = strName

        # derived data, filled lazily by the methods below
        self.intFeatures = None
        self.strFeatures = None
        self.lstOccurrence = None
        self.strFilteredReadme = None
        self.dicFoundWords = None

        d = path.dirname(__file__)
        self.strPathJSON = d + '/json/' + strUser + '_' + strName + '.json'
        # os.path.join instead of a hard-coded '\\' so the readme directory
        # is also a valid path on non-Windows systems
        self.strDirPath_readme = os.path.join(
            os.path.abspath(os.path.join(__file__, os.pardir)), 'readme')

        self.ioAgent = InputOutputAgent(strUser, strName)
        self.apiJSON, self.apiUrl, self.lstReadmePath = self.ioAgent.loadJSONdata(self.strPathJSON)

        print('url: ' + str(self.apiUrl))

    def getRepoDescription(self):
        """
        Gets the full description of the repository which is stored in the json-Api
        If the description wasn't set, an empty string "" will be returned

        :return: string which contains the description
        """
        strDescr = self.apiJSON['description']
        return "" if strDescr is None else strDescr

    def getFilteredRepoDescription(self, bApplyStemmer=True, bCheckStopWords=False):
        """
        Gets a filtered version of the description of the repository
        If the description is empty or wasn't set, an empty string "" will be returned

        :param bApplyStemmer: true if the words should be stripped to the stem
        :param bCheckStopWords: true if known stopwords such as (the, he, and,...) should be ignored
        :return: string which contains the filtered form of the description
        """
        strDescription = self.getRepoDescription()
        # compare by value (truthiness); an identity check against "" is unreliable
        if not strDescription:
            return ""
        return string_operation.prepare_words(strDescription, bApplyStemmer, bCheckStopWords)

    def getRepoLanguage(self):
        """
        Gets the language from the main json-Api-page which was assigned by github to this repository.
        If no language was allocated "undetected" will be returned

        :return: string which contains the language (e.g. C++, Java, Python,...)
        """
        strLanguage = self.apiJSON['language']
        if strLanguage is None:
            return "undetected"
        return strLanguage

    def getRepoLanguageAsVector(self):
        """
        Returns an integer-list with one entry per element of lstLanguages
        All of them are set to 0 except the entry of the language which is used;
        a language that is not part of lstLanguages is mapped to the "rare" entry

        :return: list
        """
        lstLangVec = [0] * len(lstLanguages)
        try:
            iLangIndex = lstLanguages.index(self.getRepoLanguage())
        except ValueError:
            iLangIndex = lstLanguages.index("rare")
        lstLangVec[iLangIndex] = 1
        return lstLangVec

    def getReadme(self):
        """
        Gets the raw content of the readme of the repository.
        The job for loading and exporting the readme is done by its Io-Agent.

        :return: string with the raw content
        """
        return self.ioAgent.getReadme(self.strDirPath_readme)

    def getFilteredReadme(self, bApplyStemmer=True, bCheckStopWords=False):
        """
        Returns the filtered readme with prepare_words() being applied

        The result is cached in self.strFilteredReadme.
        NOTE(review): the cache does not depend on the arguments, so a later
        call with different flags returns the text filtered by the first call.

        :param bApplyStemmer: forwarded to prepare_words()
        :param bCheckStopWords: forwarded to prepare_words()
        :return: string of the filtered readme
        """
        if self.strFilteredReadme is None:
            self.strFilteredReadme = string_operation.prepare_words(
                self.getReadme(), bApplyStemmer, bCheckStopWords)
        return self.strFilteredReadme

    def getDevTime(self):
        """
        Gets the development time of the repository in days.
        This is calculated via the difference of 'updated_at' - 'created_at'

        :return: integer number of whole days between creation and last update
        """
        # a usual Github-Time stamp looks like this: "2011-10-17T15:09:52Z"
        strGithubTimestampFormat = "%Y-%m-%dT%H:%M:%SZ"
        datStart = datetime.datetime.strptime(self.apiJSON['created_at'], strGithubTimestampFormat)
        # last update is a push or change in wiki, description...
        datLastUpdate = datetime.datetime.strptime(self.apiJSON['updated_at'], strGithubTimestampFormat)
        return (datLastUpdate - datStart).days

    def getNumOpenIssue(self):
        """
        Gets the number of open issues from the json-main-page

        :return: value of the 'open_issues' entry
        """
        return self.apiJSON['open_issues']

    def getNumWatchers(self):
        """
        Gets the number of watchers from the json-main-page

        :return: value of the 'watchers_count' entry
        """
        return self.apiJSON['watchers_count']

    @classmethod
    def fromURL(cls, strURL):
        """
        Constructor with url instead of user, name

        The url is split at '/'; the 4th and 5th part are taken as user and
        name (e.g. "https://github.com/GNOME/gimp" -> "GNOME", "gimp").

        :param strURL: url of the github-repository
        :return: new instance created by the main-constructor
        """
        iIndexUser = 3
        iIndexName = 4
        lststrLabelGroup = strURL.split('/')
        return cls(lststrLabelGroup[iIndexUser], lststrLabelGroup[iIndexName])

    def readAttributes(self):
        """
        Reads attributes of the json data and fills the integer-attributes

        NOTE(review): the IntFeatures(...) call is cut off after its first
        keyword argument in the source this was restored from; the remaining
        arguments must be re-added from the IntFeatures definition.
        """
        # NOTE(review): computed but not forwarded in the visible source;
        # presumably one of the missing IntFeatures arguments - confirm
        iDevTime = self.getDevTime()
        self.intFeatures = IntFeatures(iSubscriberCount=self.apiJSON['subscribers_count'])

    def getIntegerFeatures(self):
        """
        Gets the intFeatures as a list

        NOTE(review): the list literal is cut off after its first element in
        the source this was restored from; re-add the remaining features.

        :return: list of the integer features
        """
        lstFeatures = [self.intFeatures.iSubscriberCount]
        return lstFeatures

    def getNormedFeatures(self, lstMeanValues):
        """
        Returns the features which were normed by dividing them with the mean values

        :param lstMeanValues: mean value of every integer feature
        :return: list of the normed integer features; 0 where the mean value is 0
        """
        # norm every integer feature by dividing it with its mean value
        # and avoid dividing by 0
        return [x / y if y != 0 else 0
                for x, y in zip(self.getIntegerFeatures(), lstMeanValues)]

    def getWordOccurences(self, lstVocab):
        """
        Counts the occurrences of the vocabulary words in the filtered readme
        and the filtered description of the repository.

        The counts are stored in self.lstOccurrence and the words found by
        getFeatureOccurences() are stored in self.dicFoundWords.

        :param lstVocab: vocabulary which is used in the CountVectorizer of scikit-learn
        :return: list with the number of occurrences of every vocabulary word
        """
        vectorizer = CountVectorizer(min_df=0.5, vocabulary=lstVocab)

        strFilteredDescription = self.getFilteredRepoDescription()
        # The description is appended twice, so its words are counted double.
        # NOTE(review): presumably deliberate weighting - confirm
        strFilteredReadme = self.getFilteredReadme()
        strFilteredReadme += strFilteredDescription
        strFilteredReadme += strFilteredDescription

        # sparse matrix with one row per word of the text;
        # each column is mapped to a specific feature (see lstFeatureNames)
        matSparse = vectorizer.fit_transform(strFilteredReadme.split())

        # get_feature_names() is missing in newer scikit-learn versions
        if hasattr(vectorizer, 'get_feature_names_out'):
            lstFeatureNames = list(vectorizer.get_feature_names_out())
        else:
            lstFeatureNames = vectorizer.get_feature_names()

        # sum over all rows and flatten to a plain 1-dimensional list
        lstOccurrence = np.asarray(matSparse.sum(axis=0)).flatten().tolist()

        self.lstOccurrence = lstOccurrence
        self.dicFoundWords = self.getFeatureOccurences(lstFeatureNames, lstOccurrence, iMinOccurence=1)
        return lstOccurrence

    def getFeatureOccurences(self, lstFeatureNames, lstOccurrence, iMinOccurence=1):
        """
        Gets the found words with their number of occurrences in form of a dictionary

        The dictionary maps number of occurrences -> word. Words with the same
        number of occurrences share a key, so only the last of them is kept.

        :param lstFeatureNames: vocab list
        :param lstOccurrence: list of number of occurrences (same length as lstFeatureNames)
        :param iMinOccurence: only words occurring more often than this are included
        :return: dictionaryObject
        """
        assert (len(lstFeatureNames) == len(lstOccurrence))
        return {iCount: strWord
                for strWord, iCount in zip(lstFeatureNames, lstOccurrence)
                if iCount > iMinOccurence}

    def getDicFoundWords(self):
        """
        Gets the stored dictionary-object

        :return: dictionaryObject
        :raises Exception: if getWordOccurences() hasn't been called before
        """
        if self.dicFoundWords is None:
            raise Exception("getWordOccurences() hasn't been called yet")
        return self.dicFoundWords

    def printFeatureOccurences(self, dicFoundWords):
        """
        Prints out every found word with its number of occurrences
        Nothing is printed if the dictionary is empty

        :param dicFoundWords: dictionary as returned by getFeatureOccurences()
        """
        if not dicFoundWords:
            return
        print('detected words from the vocabulary:')
        for k, v in dicFoundWords.items():
            # fixed column widths for a more readable print layout
            print('{:15s} {:3f}'.format(v, k))

    def getName(self):
        """
        Getter method for name

        :return: self.name
        """
        return self.name

    def getUser(self):
        """
        Getter method for user

        :return: self.user
        """
        return self.user