import datetime
import os
from os import path
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from .definitions.githubLanguages import lstLanguages
from .features.learning_features import IntFeatures
from .utility_funcs import string_operation
from .utility_funcs.io_agent import InputOutputAgent
from pathlib import Path
# http://stackoverflow.com/questions/32910096/is-there-a-way-to-auto-generate-a-str-implementation-in-python
def auto_str(cls):
    """
    Class decorator which auto-generates a to-string function that prints out all member-attributes.

    The generated ``__str__`` renders the instance as ``ClassName(attr=value ...)``,
    one ``attr=value`` pair per entry of ``vars(self)``, separated by a newline
    followed by a single space.

    :param cls: class which receives the generated ``__str__``
    :return: the same class object (modified in place), so it can be used as a decorator
    """
    def __str__(self):
        return '%s(%s)' % (
            type(self).__name__,
            # vars(self).items() yields (name, value) tuples which fit '%s=%s' directly
            '\n '.join('%s=%s' % item for item in vars(self).items())
        )

    cls.__str__ = __str__
    return cls
@auto_str
class GithubRepo:
    """
    Wrapper around the stored GitHub API data and readme of one repository.

    The JSON data and the readme are obtained through an ``InputOutputAgent``;
    this class derives integer features, a language vector and vocabulary
    word counts from them.
    """

    def __init__(self, strUser, strName):
        """
        Constructor which takes two arguments to initialize the repository

        Exceptions raised by the IO agent while loading the JSON data
        propagate unchanged to the caller.

        :param strUser: user of the repository (e.g. "GNOME")
        :param strName: name of the repository (e.g. "gimp")
        """
        self.user = strUser
        self.name = strName

        # results which are filled in later by the corresponding methods
        self.intFeatures = None
        self.strFeatures = None
        self.lstOccurrence = None
        self.strFilteredReadme = None
        self.dicFoundWords = None

        strModuleDir = path.dirname(__file__)
        self.strPathJSON = strModuleDir + '/json/' + strUser + '_' + strName + '.json'
        # os.path.join instead of a hard-coded '\\' so the path is also valid outside of Windows
        self.strDirPath_readme = os.path.join(
            os.path.abspath(os.path.join(__file__, os.pardir)), 'readme')

        self.ioAgent = InputOutputAgent(strUser, strName)
        self.apiJSON, self.apiUrl, self.lstReadmePath = self.ioAgent.loadJSONdata(self.strPathJSON)

        print('url: ' + str(self.apiUrl))
        self.readAttributes()

    def getRepoDescription(self):
        """
        Gets the full description of the repository which is stored in the json-Api
        If the description wasn't set, an empty string "" will be returned

        :return: string which contains the description
        """
        strDescr = self.apiJSON['description']
        if strDescr is None:
            return ""
        return strDescr

    def getFilteredRepoDescription(self, bApplyStemmer=True, bCheckStopWords=False):
        """
        gets a filtered version of the description of the repository
        if the description wasn't set, an empty string "" will be returned

        :param bApplyStemmer: true if the words should be stripped to the stem
        :param bCheckStopWords: true if known stopwords such as (the, he, and,...) should be ignored
        :return: string which contains the filtered form of the description
        """
        strDescription = self.getRepoDescription()
        # compare by value (truthiness); an identity check against "" is not reliable
        if not strDescription:
            return ""
        return string_operation.prepare_words(strDescription, bApplyStemmer, bCheckStopWords)

    def getRepoLanguage(self):
        """
        Gets the language from the main json-Api-page which was assigned by github to this repository.
        If no language was allocated "undetected" will be returned

        :return: string which contains the language (e.g. C++, Java, Python,...)
        """
        strLanguage = self.apiJSON['language']
        if strLanguage is None:
            return "undetected"
        return strLanguage

    def getRepoLanguageAsVector(self):
        """
        Returns an integer-list with one entry per element of lstLanguages
        All of them are set to 0 except the language which is used;
        a language which is not part of lstLanguages is mapped to the entry "rare"

        :return: list
        """
        lstLangVec = [0] * len(lstLanguages)
        try:
            iLangIndex = lstLanguages.index(self.getRepoLanguage())
        except ValueError:
            iLangIndex = lstLanguages.index("rare")
        lstLangVec[iLangIndex] = 1
        return lstLangVec

    def getReadme(self):
        """
        Gets the raw content of the readme of the repository.
        The job for loading and exporting the readme is done by its Io-Agent.

        :return: string with the raw content
        """
        return self.ioAgent.getReadme(self.strDirPath_readme)

    def getFilteredReadme(self, bApplyStemmer=True, bCheckStopWords=False):
        """
        Returns the filtered readme with prepare_words() being applied.
        The result is cached; it is recalculated if the method is called with
        different flags than the ones the cached version was created with.

        :param bApplyStemmer: forwarded to prepare_words()
        :param bCheckStopWords: forwarded to prepare_words()
        :return: string of the filtered readme
        """
        tplFlags = (bApplyStemmer, bCheckStopWords)
        # a cache without remembered flags is treated as matching the current call
        tplCachedFlags = getattr(self, '_tplFilteredReadmeFlags', tplFlags)
        if self.strFilteredReadme is None or tplCachedFlags != tplFlags:
            self.strFilteredReadme = string_operation.prepare_words(
                self.getReadme(), bApplyStemmer, bCheckStopWords)
            self._tplFilteredReadmeFlags = tplFlags
        return self.strFilteredReadme

    def getDevTime(self):
        """
        Gets the development time of the repository in days.
        This is calculated via the difference of 'updated_at' - 'created_at'

        :return: integer which contains the number of whole days
        """
        # a usual Github-Time stamp looks like this: "2011-10-17T15:09:52Z"
        strGithubTimestampFormat = "%Y-%m-%dT%H:%M:%SZ"
        datStart = datetime.datetime.strptime(self.apiJSON['created_at'], strGithubTimestampFormat)
        # last update is a push or change in wiki, description...
        datLastUpdate = datetime.datetime.strptime(self.apiJSON['updated_at'], strGithubTimestampFormat)
        return (datLastUpdate - datStart).days

    def getNumOpenIssue(self):
        """
        gets the number of open issues from the json-main-page

        :return: value of the 'open_issues' entry
        """
        return self.apiJSON['open_issues']

    def getNumWatchers(self):
        """
        gets the number of watcher from the json-main-page

        :return: value of the 'watchers_count' entry
        """
        return self.apiJSON['watchers_count']

    @classmethod
    def fromURL(cls, strURL):
        """
        constructor with url instead of user, name

        The url is split at '/'; user and name are expected at position 3 and 4,
        e.g. 'https://github.com/GNOME/gimp'.

        :param strURL: url of the github-repository
        :return: calls the main-constructor
        """
        iIndexUser = 3
        iIndexName = 4
        lststrLabelGroup = strURL.split('/')
        return cls(lststrLabelGroup[iIndexUser], lststrLabelGroup[iIndexName])

    def readAttributes(self):
        """
        reads all attributes of the json-file and fills the integer-attributes

        :return:
        """
        self.intFeatures = IntFeatures(iSubscriberCount=self.apiJSON['subscribers_count'],
                                       iOpenIssues=self.getNumOpenIssue(),
                                       iDevTime=self.getDevTime(),
                                       iSize=self.apiJSON['size'])

    def getIntegerFeatures(self):
        """
        gets the intFeatures as a list

        :return: list of the integer features (subscriber count, open issues, dev time, size)
        """
        return [self.intFeatures.iSubscriberCount,
                self.intFeatures.iOpenIssues,
                self.intFeatures.iDevTime,
                self.intFeatures.iSize]

    def getNormedFeatures(self, lstMeanValues):
        """
        returns the features which were normed by dividing them with the mean values

        :param lstMeanValues: mean value of every integer feature
        :return: list of the normed integer features
        """
        # norm every integer feature by dividing it with its mean value;
        # a mean value of 0 yields 0 to avoid dividing by 0
        return [x / y if y != 0 else 0
                for x, y in zip(self.getIntegerFeatures(), lstMeanValues)]

    def getWordOccurences(self, lstVocab):
        """
        calculates the number of occurrences of the words given by the vocab list
        in the filtered readme and the filtered description.
        The absolute counts are returned; they are also stored in self.lstOccurrence,
        and the found words are stored in self.dicFoundWords and printed.

        :param lstVocab: vocabulary which is used in the CountVectorizer of scikit-learn
        :return: integer list with the number of occurrences of every vocabulary word
        """
        # min_df is ignored by scikit-learn when a fixed vocabulary is given, so it is not passed
        vectorizer = CountVectorizer(vocabulary=lstVocab)

        strReadme = self.getFilteredReadme()
        strDescription = self.getFilteredRepoDescription()
        # NOTE(review): the description is added twice, presumably to give it more weight - confirm
        # joined with spaces so that the words at the boundaries are not fused together
        strText = ' '.join([strReadme, strDescription, strDescription])

        # return a sparse matrix; every word is handled as an own document
        # each column is mapped to a specific feature (see lstFeatureNames)
        matSparse = vectorizer.fit_transform(strText.split())

        # get_feature_names() was removed in newer scikit-learn versions
        if hasattr(vectorizer, 'get_feature_names_out'):
            lstFeatureNames = list(vectorizer.get_feature_names_out())
        else:
            lstFeatureNames = vectorizer.get_feature_names()

        # sum over all documents and flatten the (1 x n) matrix to a plain list
        lstOccurrence = np.asarray(np.sum(matSparse, axis=0)).flatten().tolist()
        self.lstOccurrence = lstOccurrence

        self.dicFoundWords = self.getFeatureOccurences(lstFeatureNames, lstOccurrence, iMinOccurence=1)
        self.printFeatureOccurences(self.dicFoundWords)
        return lstOccurrence

    def getFeatureOccurences(self, lstFeatureNames, lstOccurrence, iMinOccurence=1):
        """
        gets the found words with its number of occurrences in form of a dictionary

        NOTE(review): the dictionary is keyed by the number of occurrences, so words
        with the same count overwrite each other (the last one wins). This mapping is
        kept because callers of getDicFoundWords() may rely on it.

        :param lstFeatureNames: vocab list
        :param lstOccurrence: list of number of occurrences
        :param iMinOccurence: minimum number of hits which are needed (inclusive)
        :return: dictionaryObject {number of occurrences: word}
        """
        assert (len(lstFeatureNames) == len(lstOccurrence))
        dicFoundWords = {}
        for strWord, iTmpOccurrence in zip(lstFeatureNames, lstOccurrence):
            if iTmpOccurrence >= iMinOccurence:
                dicFoundWords[iTmpOccurrence] = strWord
        return dicFoundWords

    def getDicFoundWords(self):
        """
        gets the stored dictionary-object

        :return: dictionaryObject
        :raises RuntimeError: if getWordOccurences() hasn't been called before
        """
        if self.dicFoundWords is None:
            raise RuntimeError('getWordOccurences() hasn\'t been called yet')
        return self.dicFoundWords

    def printFeatureOccurences(self, dicFoundWords):
        """
        Prints out every found word with its number of occurrences.
        Nothing is printed if the dictionary is empty.

        :param dicFoundWords: dictionary {number of occurrences: word} as created by getFeatureOccurences()
        :return:
        """
        if not dicFoundWords:
            return

        strStopper2 = "-" * 80
        print(strStopper2)
        print('detected words from the vocabulary:')
        for k, v in dicFoundWords.items():
            # for more beautiful print layout {:15s} and {:3f} is used
            print('{:15s} {:3f}'.format(v, k))
        print(strStopper2)

    def getName(self):
        """
        getter method for name

        :return: self.name
        """
        return self.name

    def getUser(self):
        """
        getter method for user

        :return: self.user
        """
        return self.user