Source code for prototype.github_repo

import datetime
import os
from os import path

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

from .definitions.githubLanguages import lstLanguages
from .features.learning_features import IntFeatures
from .utility_funcs import string_operation
from .utility_funcs.io_agent import InputOutputAgent
from pathlib import Path
# http://stackoverflow.com/questions/32910096/is-there-a-way-to-auto-generate-a-str-implementation-in-python
[docs]def auto_str(cls):
    """
    Method for auto-generating a to string-function which prints out all member-attributes

    :param cls: current class
    :return: cls
    """
    def __str__(self):
        return '%s(%s)' % (
            type(self).__name__,
            '\n '.join('%s=%s' % item for item in vars(self).items())
        )

    cls.__str__ = __str__
    return cls


@auto_str
[docs]class GithubRepo:

    def __init__(self, strUser, strName):
        """
        Constructor which takes two arguments to initialize the repository

        :param strUser: user of the repository (e.g. "GNOME")
        :param strName: name of the repository (e.g. "gimp")
        """
        self.user = strUser
        self.name = strName
        # print('user: ', self.user, 'name: ', self.name)

        d = path.dirname(__file__)
        # d = str(Path())

        self.strPathJSON = d + '/json/' + strUser + '_' + strName + '.json'

        try:
            self.ioAgent = InputOutputAgent(strUser, strName)
            self.apiJSON, self.apiUrl, self.lstReadmePath = self.ioAgent.loadJSONdata(self.strPathJSON)

            self.strDirPath_readme = os.path.abspath(os.path.join(__file__, os.pardir)) + '\\readme'

            self.intFeatures = None
            self.strFeatures = None
            self.lstOccurrence = None
            self.strFilteredReadme = None
            self.dicFoundWords = None

            print('url: ' + str(self.apiUrl))
            self.readAttributes()
        except ConnectionError as e:
            raise e


[docs]    def getRepoDescription(self):
        """
        Gets the full description of the repository which is stored in the json-Api
        If the description wasn't set, an empty string "" will be returned

        :return: string which contains the description
        """
        strDescr = self.apiJSON['description']
        if strDescr is None:
            return ""
        else:
            return strDescr

[docs]    def getFilteredRepoDescription(self, bApplyStemmer=True, bCheckStopWords=False):
        """
        gets a filtered version of the description of the repository
        if the description wasn't set, an empty string "" will be returned

        :param bApplyStemmer: true if the words should be stripped to the stem
        :param bCheckStopWords: true if known stopwords such as (the, he, and,...) should be ignored
        :return: string which contains the filtered form of the description
        """
        strDescription = self.getRepoDescription()
        if strDescription is not "":
            # return strDescription
            return string_operation.prepare_words(strDescription, bApplyStemmer, bCheckStopWords)
        else:
            return ""

[docs]    def getRepoLanguage(self):
        """
        Gets the language from the main json-Api-page which was assigned by github to this repository.
        If no language was allocated "undetected" will be returned

        :return: string which contains the language (e.g. C++, Java, Python,...)
        """
        strLanguage = self.apiJSON['language']
        if strLanguage is not None:
            return strLanguage
        else:
            return "undetected"

[docs]    def getRepoLanguageAsVector(self):
        """
        Returns an integer-list with 102 entries
        All of them are set to 0 except the language which is used

        :return: list
        """
        lstLangVec = [0] * len(lstLanguages)
        try:
            iLangIndex = lstLanguages.index(self.getRepoLanguage())
        except ValueError:
            iLangIndex = lstLanguages.index("rare")

        lstLangVec[iLangIndex] = 1

        return lstLangVec


[docs]    def getReadme(self):
        """
        Gets the raw content of the readme of the repository which can either be a README.md or README.rst file.
        The job for loading and exporting the readme is done by it's Io-Agent.

        :return: string with the raw content
        """
        strMyREADME = self.ioAgent.getReadme(self.strDirPath_readme)
        return strMyREADME

[docs]    def getFilteredReadme(self, bApplyStemmer=True, bCheckStopWords=False):
        """
        Returns the filtered readme with prepare_words() being applied

        :return: string of the filtered readme
        """
        if self.strFilteredReadme is None:
            self.strFilteredReadme = string_operation.prepare_words(self.getReadme(), bApplyStemmer, bCheckStopWords)

        return self.strFilteredReadme

[docs]    def getDevTime(self):
        """
        Gets the devolpment time of the repository in days.
        This is calculated via the difference of 'created_at' - 'updated_at'

        :return: integer which
        """
        # a usual Github-Time stamp looks like this:
        # "2011-10-17T15:09:52Z"

        # example conversion: stackoverflow.com/questions/5385238/python-reading-logfile-with-timestamp-including-microseconds
        # >>> s = "2010-01-01 18:48:14.631829"
        # >>> datetime.datetime.strptime(s, "%Y-%m-%d %H:%M:%S.%f")

        strGithubTimestampFormat = "%Y-%m-%dT%H:%M:%SZ"
        datStart = datetime.datetime.strptime(self.apiJSON['created_at'], strGithubTimestampFormat)
        # last update is a push or change in wiki, description...

        datLastUpdate = datetime.datetime.strptime(self.apiJSON['updated_at'], strGithubTimestampFormat)
        iDevTime = (datLastUpdate - datStart).days

        return iDevTime

[docs]    def getNumOpenIssue(self):
        """
        gets the number of open issues from the json-main-page

        :return:
        """
        return self.apiJSON['open_issues']

[docs]    def getNumWatchers(self):
        """
        gets the number of watcher from the json-main-page
        :return:
        """
        return self.apiJSON['watchers_count']

    @classmethod
[docs]    def fromURL(cls, strURL):
        """
        constructor with url instead of user, name

        :param strURL: url of the github-repository
        :return: calls the main-constructor
        """
        iIndexUser = 3
        iIndexName = 4
        lststrLabelGroup = strURL.split('/')
        return cls(lststrLabelGroup[iIndexUser], lststrLabelGroup[iIndexName])


[docs]    def readAttributes(self):
        """
        reads all attributes of the json-file and fills the integer-attributes

        :return:
        """
        iDevTime = self.getDevTime()

        self.intFeatures = IntFeatures(iSubscriberCount=self.apiJSON['subscribers_count'],
                                       iOpenIssues=self.getNumOpenIssue(),
                                       iDevTime=iDevTime,
                                       iSize=self.apiJSON['size'])


[docs]    def getIntegerFeatures(self):
        """
        gets the intFeatures as a list

        :return: list of the integer features
        """
        lstFeatures = [self.intFeatures.iSubscriberCount,
                       self.intFeatures.iOpenIssues,
                       self.intFeatures.iDevTime,
                       self.intFeatures.iSize
                       ]

        return lstFeatures

[docs]    def getNormedFeatures(self, lstMeanValues):
        """
        returns the features which were normed by dividing them with the mean values

        :param lstMeanValues: mean value of every integer feature
        :return: list of the normed integer features
        """
        lstNormedFeatures = self.getIntegerFeatures()
        # norm every integer feature by dividing it with it's mean value
        # avoid dividing by 0
        lstNormedFeatures[:] = [x / y if y != 0 else 0 for x, y in zip(lstNormedFeatures, lstMeanValues)]
        return lstNormedFeatures

[docs]    def getWordOccurences(self, lstVocab):
        """
        calculates the number of occurrences of the words given by the vocab list;
        afterwards this list is divided by the word-length of the readme and multiplied with a factor

        :param lstVocab: vocabulary which is used in the CountVectorizer of scikit-learn
        :return: integer list representing the percentage-usage of the vocabulary words
        """

        vectorizer = CountVectorizer(min_df=0.5, vocabulary=lstVocab)

        strFilteredReadme = ""
        strFilteredReadme = self.getFilteredReadme()
        getFilteredRepoDescription = self.getFilteredRepoDescription()
        strFilteredReadme += getFilteredRepoDescription
        strFilteredReadme += getFilteredRepoDescription

        # print(strFilteredReadme)

        # return a sparse matrix
        # each column is mapped to a specific feature (see lstFeatureNames)
        # the value describes the occurrence of the word in the current line
        matSparse = vectorizer.fit_transform(strFilteredReadme.split())

        lstFeatureNames = vectorizer.get_feature_names()

        # print('~~~~~~~~~~ Number-of-total-occurrences ~~~~~~~~~~')
        # print('--> repository: ' + self.user + '_' + self.name + '~~~~~~~~~~')
        matOccurrence = np.asarray(np.sum(matSparse, axis=0))

        # flatten makes a matrix 1 dimensional
        lstOccurrence = np.array(matOccurrence.flatten()).tolist()    # np.array().tolist() is not needed

        iHits = np.sum(self.lstOccurrence)

        # divide each element by a factor to reduce the effectiveness
        iLen = len(strFilteredReadme.split())

        # avoid dividing by 0
        if iLen == 0:
            iLen = 1

        if iHits == 0:
            iHits = 1

        self.dicFoundWords = self.getFeatureOccurences(lstFeatureNames, lstOccurrence, iMinOccurence=1)
        self.printFeatureOccurences(self.dicFoundWords)

        return lstOccurrence

[docs]    def getFeatureOccurences(self, lstFeatureNames, lstOccurrence, iMinOccurence=1):
        """
        gets the found words with it's number of occurrences in form of a dictionary

        :param lstFeatureNames: vocab list
        :param lstOccurrence: list of number of occurrences
        :param iMinOccurence: minimum number of hits which are needed
        :return: dictionaryObject
        """
        assert (len(lstFeatureNames) == len(lstOccurrence))
        i = 0
        dicFoundWords = {}
        for iTmpOccurrence in lstOccurrence:
            if iTmpOccurrence > iMinOccurence:
                dicFoundWords[iTmpOccurrence] = lstFeatureNames[i]
            i += 1

        return dicFoundWords

[docs]    def getDicFoundWords(self):
        """
        gets the stored dictionary-object
        :return: dictionaryObject
        """

        if self.dicFoundWords is None:
            raise Exception('getWordOccurences() hasn\'t been called yet')
        return self.dicFoundWords

[docs]    def printFeatureOccurences(self, dicFoundWords): #lstFeatureNames, lstOccurrence, iMinOccurence=1):
        """
        Prints out every feature with it's number occurence

        :param lstFeatureNames:     list of the given features, these are the column of the sparse-matrix
        :param lstOccurrence:       number of occurence of the individual features (has the same size as lstFeatureName
        :param iMinOccurence:       minimum threshold to print out the feature (if set to 0 all features are print out)
        :return:
        """

        if len(dicFoundWords.items()) > 0:
            strStopper1 = "=" * 80
            strStopper2 = "-" * 80
            print(strStopper2)
            print('detected words from the vocabulary:')

            for k, v in dicFoundWords.items():
                # for more beautiful print layout {:15s} and {:3d} is used
                print('{:15s} {:3f}'.format(v, k))  # {:3d} for integers

            print(strStopper2)

[docs]    def getName(self):
        """
        getter method for name

        :return: self.name
        """
        return self.name

[docs]    def getUser(self):
        """
        getter method for user

        :return: self.user
        """
        return self.user