from operator import add
import numpy as np
import pandas as pd
# sklearn.externals.joblib was removed in scikit-learn 0.23 -> use the standalone joblib package
import joblib
from sklearn.neighbors import NearestCentroid
from os import path
from .utility_funcs.preprocessing_operations import initInputParameters, readVocabFromFile
from .interface_repository_classifier import Interface_RepoClassifier
from sklearn import preprocessing
import os
from .github_repo import GithubRepo
from pathlib import Path
class RepositoryClassifier(Interface_RepoClassifier):
    def __init__(self, bUseStringFeatures=True):
        """
        constructor which initializes member variables
        """
        self.bModelLoaded = False
        self.bModelTrained = False
        self.clf = None
        self.lstMeanValues = None
        self.lstVoc = None
        self.stdScaler = None
        self.lstTrainLabels = None
        self.lstTrainData = None
        self.normalizer = None
        self.bUseCentroids = True
        self.normalizerIntegerAttr = None
        # self.scaler = None
        self.lstTrainDataRaw = None
        self.lstStrCategories = ['DEV', 'HW', 'EDU', 'DOCS', 'WEB', 'DATA', 'OTHER']
        self.directory = path.dirname(__file__)
        print(self.directory)
        self.bUseStringFeatures = bUseStringFeatures
        # get the project-directory
        self.strProjectDir = str(Path().resolve().parent)
        print('strProjectDir:', self.strProjectDir)
        self.strModelPath = self.directory + '/model/'
        # Create model-directory if needed
        if not os.path.exists(self.strModelPath):
            os.makedirs(self.strModelPath)
        self.strModelFileName = 'RepositoryClassifier.pkl'
        self.strLstMeanValuesFileName = 'lstMeanValues.pkl'
        self.strMatIntegerTrainingData = 'matIntegerTrainingData.pkl'
        self.strLstTrainLabels = 'lstTrainLabels.pkl'
        self.strLstTrainData = 'lstTrainData.pkl'
        self.strNormalizer = 'normalizer.pkl'
        self.strNormalizerIntegerAttr = 'normalizerIntegerAttr.pkl'
        self.strLstTrainDataRaw = 'lstTrainDataRaw.pkl'
        self.iNumCategories = len(self.lstStrCategories)
        self.matIntegerTrainingData = []
    def loadTrainingData(self, strProjPathFileNameCSV='/data/csv/additional_data_sets_cleaned.csv', externalpath=None):
        """
        trains the model with a given csv-file. the csv file must have 2 columns URL and CATEGORY.
        the URL is given in the form 'https://github.com/owner/repository-name'
        the CATEGORY is given by one of these options 'DEV', 'HW', 'EDU', 'DOCS', 'WEB', 'DATA', 'OTHER'
        :param strProjPathFileNameCSV: file path relative to the project-path where the csv-file is stored
        :return: self.lstTrainData (the scaled and normed data with which the model was trained with),
         self.lstTrainLabels (the used training labels)
        """
        if externalpath is None:
            trainData = pd.read_csv(self.directory + strProjPathFileNameCSV, header=0, delimiter=",")
        else:
            trainData = pd.read_csv(strProjPathFileNameCSV, header=0, delimiter=",")
        iNumTrainData = len(trainData.index)
        print("iNumTrainData: ", iNumTrainData)
        print('~~~~~~~~~~ EXTRACTING FEATURES ~~~~~~~~~~')
        lstGithubRepo = []
        for i in range(iNumTrainData):
            # fill the list with GithubRepo-Objects
            lstGithubRepo.append(GithubRepo.fromURL(trainData["URL"][i]))
        # fill the train and the label-data
        self.lstTrainData = []
        self.lstTrainDataRaw = []
        self.lstTrainLabels = []
        print('~~~~~~~~~~ CALCULATE THE MEAN VALUES ~~~~~~~~~~')
        self.lstMeanValues = [0] * 7  # one accumulator per integer feature
        i = 0
        for tmpRepo in lstGithubRepo:
            # lstMeanValues += tmpGithubRepo.getIntegerFeatures()
            self.lstMeanValues = list(map(add, self.lstMeanValues, tmpRepo.getIntegerFeatures()))
            # find the corresponding label as an integer for the current repository
            # the label is defined in trainData
            self.lstTrainLabels.append(self.lstStrCategories.index(trainData["CATEGORY"][i]))
            i += 1
        # Divide each element by the number of training samples
        self.lstMeanValues[:] = [x / iNumTrainData for x in self.lstMeanValues]
        print('lstMeanValues: ', self.lstMeanValues)
        print('~~~~~~~~~~ GET THE VOCABULARY ~~~~~~~~~~')
        strVocabPath = self.directory + '/vocab/'
        # Create the vocab-directory if needed
        if not os.path.exists(strVocabPath):
            os.makedirs(strVocabPath)
        strVocabPath += 'vocabList.dump'
        self.lstVoc = initInputParameters(strVocabPath, lstGithubRepo)
        print('lstVoc: ', self.lstVoc)
        print('len(lstVoc): ', len(self.lstVoc))
        for tmpRepo in lstGithubRepo:
            # the normed integer attributes form the base of the feature vector
            lstInputFeatures = tmpRepo.getNormedFeatures(self.lstMeanValues)
            lstInputFeaturesRaw = tmpRepo.getIntegerFeatures()
            self.matIntegerTrainingData.append(tmpRepo.getNormedFeatures(self.lstMeanValues))
            if self.bUseStringFeatures:
                lstInputFeatures += tmpRepo.getWordOccurences(self.lstVoc)
                lstInputFeaturesRaw += tmpRepo.getWordOccurences(self.lstVoc)
            lstInputFeatures += tmpRepo.getRepoLanguageAsVector()
            lstInputFeaturesRaw += tmpRepo.getRepoLanguageAsVector()
            # test using unnormed features
            self.lstTrainData.append(lstInputFeatures)
            self.lstTrainDataRaw.append(lstInputFeaturesRaw)
        print("lstTrainData:")
        print(self.lstTrainData)
        print("lstTrainLabels:")
        print(self.lstTrainLabels)
        print('self.matIntegerTrainingData')
        print(self.matIntegerTrainingData)
        print('~~~~~~~~~~ NORMALIZE ~~~~~~~~~~~~~')
        self.normalizer = preprocessing.Normalizer()
        self.normalizer.fit(self.lstTrainData)
        self.normalizerIntegerAttr = preprocessing.Normalizer()
        self.normalizerIntegerAttr.fit(self.matIntegerTrainingData)
        self.lstTrainData = self.normalizer.transform(self.lstTrainData)
        return self.lstTrainData, self.lstTrainLabels 
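    # hedged usage sketch for loadTrainingData(); the csv-layout below is the one
    # described in the docstring, the rows are purely illustrative:
    #
    #   URL,CATEGORY
    #   https://github.com/owner/repository-name,DEV
    #   https://github.com/owner/other-repository,DOCS
    #
    #   repoClassifier = RepositoryClassifier()
    #   lstTrainData, lstTrainLabels = repoClassifier.loadTrainingData()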
    def trainModel(self, lstTrainData, lstTrainLabels):
        """
        trains the model called self.clf with the given trainData and trainLabels
        :param lstTrainData: list
        :param lstTrainLabels:
        :return:
        """
        print('~~~~~~~~~~ TRAIN THE MODEL ~~~~~~~~~~')
        # train the nearest-centroid model
        # the "shrink_threshold" parameter only had a negative impact on the prediction results
        self.clf = NearestCentroid()
        # test out other classifiers
        # self.clf = KNeighborsClassifier()
        # self.clf = SVC()
        # self.clf = RadiusNeighborsClassifier(radius=100)
        # self.clf = MLPClassifier()
        # self.clf = GaussianProcessClassifier()
        # self.clf = LogisticRegression()
        # self.fit_transform()
        self.clf.fit(lstTrainData, lstTrainLabels)
        # this will break the machine
        # self.plotTheResult(lstTrainData, lstTrainLabels)
        self.bModelTrained = True 
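    # minimal end-to-end sketch (a hedged example, assuming a csv-file as described
    # in loadTrainingData is available under the default path):
    #
    #   repoClassifier = RepositoryClassifier()
    #   lstTrainData, lstTrainLabels = repoClassifier.loadTrainingData()
    #   repoClassifier.trainModel(lstTrainData, lstTrainLabels)
    #   repoClassifier.exportModelToFile()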
    def plotTheResult(self, lstTrainData, lstTrainLabels):
        """
        this is currently empty -> see the plots in the GUI instead
        :param lstTrainData: matrix which was used for training
        :param lstTrainLabels: labels which were used for training
        :return:
        """
        pass 
    def exportModelToFile(self):
        """
        exports the trained model and the mean values of the input variables to './model/'
        the export is done via joblib.dump() to .pkl-files
        :return:
        """
        if self.bModelTrained:
            print('~~~~~~~~~~ SAVE MODEL TO FILE ~~~~~~~')
            # http://scikit-learn.org/stable/modules/model_persistence.html
            # http://stackoverflow.com/questions/10592605/save-classifier-to-disk-in-scikit-learn
            # save the trained classifier to a file
            joblib.dump(self.clf, self.strModelPath + self.strModelFileName)
            joblib.dump(self.lstMeanValues, self.strModelPath + self.strLstMeanValuesFileName)
            joblib.dump(self.matIntegerTrainingData, self.strModelPath + self.strMatIntegerTrainingData)
            joblib.dump(self.lstTrainLabels, self.strModelPath + self.strLstTrainLabels)
            joblib.dump(self.lstTrainData, self.strModelPath + self.strLstTrainData)
            joblib.dump(self.normalizer, self.strModelPath + self.strNormalizer)
            joblib.dump(self.normalizerIntegerAttr, self.strModelPath + self.strNormalizerIntegerAttr)
            joblib.dump(self.lstTrainDataRaw, self.strModelPath + self.strLstTrainDataRaw) 
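    # the persistence mechanism in isolation (a sketch; 'model.pkl' and obj are
    # only illustrative names, not part of this class):
    #
    #   import joblib
    #   joblib.dump(obj, 'model.pkl')    # serialize any picklable object to disk
    #   obj = joblib.load('model.pkl')   # restore it in a later session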
    def loadModelFromFile(self):
        """
        loads / imports the model-object from './model/RepositoryClassifier.pkl'
        and the list of the mean values from './model/lstMeanValues.pkl'
        :return:
        """
        print('~~~~~~~~~~ LOAD THE MODEL ~~~~~~~~~~~')
        # load the classifier from the file
        self.clf = joblib.load(self.strModelPath + self.strModelFileName)
        self.lstMeanValues = joblib.load(self.strModelPath + self.strLstMeanValuesFileName)
        # load the integer training data for later plotting
        self.matIntegerTrainingData = joblib.load(self.strModelPath + self.strMatIntegerTrainingData)
        self.lstTrainLabels = joblib.load(self.strModelPath + self.strLstTrainLabels)
        self.lstTrainData = joblib.load(self.strModelPath + self.strLstTrainData)
        self.normalizer = joblib.load(self.strModelPath + self.strNormalizer)
        self.normalizerIntegerAttr = joblib.load(self.strModelPath + self.strNormalizerIntegerAttr)
        self.lstTrainDataRaw = joblib.load(self.strModelPath + self.strLstTrainDataRaw)
        print('lstMeanValues: ', self.lstMeanValues)
        print('~~~~~~~~~~ GET THE VOCABULARY ~~~~~~~~~~')
        strVocabPath = self.directory + '/vocab/'
        strVocabPath += 'vocabList.dump'
        self.lstVoc = readVocabFromFile(strVocabPath)
        # only print out the first 7 and the last 7 entries
        # http://stackoverflow.com/questions/646644/how-to-get-last-items-of-a-list-in-python
        print('len(self.lstVoc):', len(self.lstVoc))
        if len(self.lstVoc) > 14:
            print("[", end="")
            print(*self.lstVoc[:7], sep=", ", end=" ")
            print('...', end=" ")
            print(*self.lstVoc[-7:], sep=", ", end="")
            print("]")
        self.bModelLoaded = True
        return self.clf, self.lstMeanValues, self.matIntegerTrainingData, self.lstTrainLabels, self.lstTrainData, self.normalizer, self.normalizerIntegerAttr, self.lstTrainDataRaw 
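    # hedged usage sketch: restore a previously exported model instead of re-training:
    #
    #   repoClassifier = RepositoryClassifier()
    #   repoClassifier.loadModelFromFile()
    #   repoClassifier.predictCategoryFromURL('https://github.com/owner/repository-name')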
    def predictResultsAndCompare(self, strProjPathFileNameCSV='/data/csv/manual_classification_appendix_b.csv'):  # alternative: '/data/csv/additional_data_sets_cleaned.csv'
        """
        loads a csv-file with the layout 'URL, CATEGORY, CATEGORY_ALTERNATIVE_1, CATEGORY_ALTERNATIVE_2'
        the URL is given in the format 'https://github.com/owner/repository-name'
        CATEGORY, CATEGORY_ALTERNATIVE_1 and CATEGORY_ALTERNATIVE_2 are each given by one of the options 'DEV', 'HW',
         'EDU', 'DOCS', 'WEB', 'DATA', 'OTHER'
        after the prediction phase the result is compared with the given CATEGORY and the CATEGORY_ALTERNATIVES
        a verification matrix is created and the accuracy is calculated from 0.0 to 1.0
        :param strProjPathFileNameCSV: file path relative to the module directory where the csv-file is stored
        :return: the accuracy value (0.0 - 1.0)
        """
        if not self.bModelLoaded and not self.bModelTrained:
            print('the model hasn\'t been loaded or trained yet')
            return
        print('~~~~~~~~~~ CREATE VERITY COMP MATRIX ~~~~~~~~')
        print('~~~~~~~~~~ PREDICT RESULTS ~~~~~~~~~~')
        # classify the result
        # read the unlabeled data set from a csv
        dtUnlabeledData = pd.read_csv(self.directory + strProjPathFileNameCSV, header=0, delimiter=",")  # , nrows=iNumOfPredictions)
        # http://stackoverflow.com/questions/15943769/how-to-get-row-count-of-pandas-dataframe
        iNumOfPredictions = len(dtUnlabeledData.index)
        print('~~~~~~~~~~~ CREATE VERITY MATRIX ~~~~~~~~~~~~')
        matPredictionTarget = np.zeros((iNumOfPredictions, self.iNumCategories))
        # use a verity matrix to validate the result
        matPredictionRes = np.copy(matPredictionTarget)
        matPredictionResWithAlt = np.copy(matPredictionTarget)
        for i in range(iNumOfPredictions):
            # set the verity matrix
            strTarget = dtUnlabeledData["CATEGORY"][i]
            strTargetAlt1 = dtUnlabeledData["CATEGORY_ALTERNATIVE_1"][i]
            strTargetAlt2 = dtUnlabeledData["CATEGORY_ALTERNATIVE_2"][i]
            print('strTarget: ', strTarget)
            if pd.notnull(strTargetAlt1):
                print('strTargetAlt1:', strTargetAlt1)
                matPredictionTarget[i, self.lstStrCategories.index(strTargetAlt1)] = 1
            if pd.notnull(strTargetAlt2):
                print('strTargetAlt2:', strTargetAlt2)
                matPredictionTarget[i, self.lstStrCategories.index(strTargetAlt2)] = 1
            iLabel, iLabelAlt, lstFinalPercentages, tmpRepo, lstNormedInputFeatures = self.predictCategoryFromURL(dtUnlabeledData["URL"][i])
            matPredictionTarget[i, self.lstStrCategories.index(strTarget)] = 1
            print()
            matPredictionRes[i, iLabel] = 1
            matPredictionResWithAlt[i, iLabel] = 1
            matPredictionResWithAlt[i, iLabelAlt] = 1
        # one [found, correct, reliability] triple per category
        matPredictionResultByCategory = [[0, 0, 0] for _ in range(self.iNumCategories)]
        for i in range(iNumOfPredictions):
            for j in range(self.iNumCategories):
                if matPredictionRes[i][j] == 1:
                    matPredictionResultByCategory[j][0] += 1
                    if matPredictionRes[i][j] == matPredictionTarget[i][j]:
                        matPredictionResultByCategory[j][1] += 1
                        print("i, j", i, j, matPredictionResultByCategory)
                    else:
                        print("i, j, not", i, j, matPredictionResultByCategory)
        for i in range(len(matPredictionResultByCategory)):
            # guard against categories which were never predicted
            if matPredictionResultByCategory[i][0] > 0:
                matPredictionResultByCategory[i][2] = matPredictionResultByCategory[i][1] / matPredictionResultByCategory[i][0]
        # print the result of the last prediction from the loop above
        self.__printResult(tmpRepo, iLabel, iLabelAlt)
        print('verity matrix for matPredictionTarget:\n ', matPredictionTarget)
        print('verity matrix for matPredictionRes:\n ', matPredictionRes)
        matCompRes = np.multiply(matPredictionTarget, matPredictionRes)
        matCompResAlt = np.multiply(matPredictionTarget, matPredictionResWithAlt)
        fPredictionRes = sum(matCompRes.flatten()) / iNumOfPredictions
        fPredictionResWithAlt = sum(matCompResAlt.flatten()) / iNumOfPredictions
        print('fPredictionRes:', fPredictionRes)
        print('fPredictionResWithAlt:', fPredictionResWithAlt)
        fAccuracy = fPredictionRes * 100
        print('fAccuracy: ', fAccuracy, '%\n')
        print('DEV: found:', matPredictionResultByCategory[0][0], ', correct:', matPredictionResultByCategory[0][1], ', reliability:', matPredictionResultByCategory[0][2] * 100, '%',
              '\nHW: found:', matPredictionResultByCategory[1][0], ', correct:', matPredictionResultByCategory[1][1], ', reliability:', matPredictionResultByCategory[1][2] * 100, '%',
              '\nEDU: found:', matPredictionResultByCategory[2][0], ', correct:', matPredictionResultByCategory[2][1], ', reliability:', matPredictionResultByCategory[2][2] * 100, '%',
              '\nDOCS: found:', matPredictionResultByCategory[3][0], ', correct:', matPredictionResultByCategory[3][1], ', reliability:', matPredictionResultByCategory[3][2] * 100, '%',
              '\nWEB: found:', matPredictionResultByCategory[4][0], ', correct:', matPredictionResultByCategory[4][1], ', reliability:', matPredictionResultByCategory[4][2] * 100, '%',
              '\nDATA: found:', matPredictionResultByCategory[5][0], ', correct:', matPredictionResultByCategory[5][1], ', reliability:', matPredictionResultByCategory[5][2] * 100, '%',
              '\nOTHER: found:', matPredictionResultByCategory[6][0], ', correct:', matPredictionResultByCategory[6][1], ', reliability:', matPredictionResultByCategory[6][2] * 100, '%')
        return fPredictionRes 
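    # worked sketch of the verity-matrix accuracy used above (2 categories and
    # 3 predictions, numbers are illustrative):
    #
    #   matPredictionTarget = np.array([[1, 0], [0, 1], [1, 0]])  # one-hot targets
    #   matPredictionRes    = np.array([[1, 0], [1, 0], [1, 0]])  # one-hot predictions
    #
    # np.multiply() keeps a 1 only where prediction and target agree, so
    # sum(matCompRes.flatten()) / iNumOfPredictions evaluates to 2 / 3 here.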
    def predictCategoryFromOwnerRepoName(self, strUser, strRepoName):
        """
        predicts the category for a repository which is given by the user and repo-name
        :param strUser: owner of the repository
        :param strRepoName: name of the repository
        :return:
        """
        tmpRepo = GithubRepo(strUser, strRepoName)
        return self.predictCategoryFromGitHubRepoObj(tmpRepo) 
    def predictCategoryFromURL(self, strGitHubRepoURL):
        """
        loads the features of a given repository by URL and the model predicts its category-label
        :param strGitHubRepoURL: url to the repository
        :return: label value from 0 - 6, list of the percentages for the other categories
        """
        tmpRepo = GithubRepo.fromURL(strGitHubRepoURL)
        return self.predictCategoryFromGitHubRepoObj(tmpRepo) 
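    # hedged usage sketch (requires a trained or loaded model, see above):
    #
    #   iLabel, iLabelAlt, lstFinalPercentages, tmpRepo, lstNormedInputFeatures = \
    #       repoClassifier.predictCategoryFromURL('https://github.com/owner/repository-name')
    #   print(repoClassifier.lstStrCategories[iLabel])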
    def predictCategoryFromGitHubRepoObj(self, tmpRepo):
        """
        predicts the category for a GithubRepo-Object
        :param tmpRepo: GithubRepo-Object
        :return: iLabel, iLabelAlt, lstFinalPercentages, tmpRepo, lstNormedInputFeatures
        """
        lstNormedInputFeatures = tmpRepo.getNormedFeatures(self.lstMeanValues)
        if self.bUseStringFeatures:
            lstNormedInputFeatures += tmpRepo.getWordOccurences(self.lstVoc)
        lstNormedInputFeatures += tmpRepo.getRepoLanguageAsVector()
        # reshape the input features to a 2d-array -> otherwise a deprecation warning occurs
        lstNormedInputFeatures = np.array(lstNormedInputFeatures).reshape(1, len(lstNormedInputFeatures))
        # apply the pre-processing which was fitted on the training data
        lstNormedInputFeatures = self.normalizer.transform(lstNormedInputFeatures)
        iLabel = int(self.clf.predict(lstNormedInputFeatures)[0])
        if self.bUseCentroids is True:
            matCentroids = self.clf.centroids_
            lstFinalPercentages = self.predictProbaNearestCentroids(matCentroids, lstNormedInputFeatures)
        else:
            lstFinalPercentages = self.clf.predict_proba(lstNormedInputFeatures)
        iLabelAlt = self.getLabelAlternative(lstFinalPercentages)
        self.__printResult(tmpRepo, iLabel, iLabelAlt, bPrintWordHits=False)
        return iLabel, iLabelAlt, lstFinalPercentages, tmpRepo, lstNormedInputFeatures 
    def getLabelAlternative(self, lstFinalPercentages):
        """
        gets the first alternative (the seoond result)
        :param lstFinalPercentages: percentages lsit for the single categories
        :return: integer label which describes the category
        """
        # copy the percentages in an additional list
        lstFinalPercentagesCopy = []
        lstFinalPercentagesCopy = lstFinalPercentages[:]
        # get the s
        iMaxIndex = lstFinalPercentagesCopy.index(max(lstFinalPercentagesCopy))
        lstFinalPercentagesCopy[iMaxIndex] = 0
        iSecondMaxIndex = lstFinalPercentagesCopy.index(max(lstFinalPercentagesCopy))
        return iSecondMaxIndex 
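    # worked example for getLabelAlternative() (values are illustrative):
    #
    #   lstFinalPercentages = [0.1, 0.5, 0.3, 0.05, 0.02, 0.02, 0.01]
    #
    # the maximum (0.5 at index 1) is zeroed out in the copy, so the second-highest
    # value (0.3 at index 2) becomes the new maximum and 2 is returned.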
    def predictProbaNearestCentroids(self, matCentroids, lstInputFeatures):
        """
        because predictProba was missing in the default functionality for nearest-centroid
        the probability is now calculated via the distances to the different centroids
        :param matCentroids: matrix of the centroids for each category
        :param lstInputFeatures: full normed input feature list for which the prediction is based on
        :return:
        """
        lstFinalPercentages = []
        fDistSum = 0
        lstDistances = []
        for i, centroid in enumerate(matCentroids):
            fDist = np.linalg.norm(lstInputFeatures - centroid)
            lstDistances.append((i, fDist))
            fDistSum += fDist
        # sort ascending by distance -> the nearest centroid comes first
        lstDistances.sort(key=lambda x: x[1])
        lstPercentages = []
        for i in range(len(lstDistances)):
            lstPercentages.append(lstDistances[i][1] / fDistSum)
        # pair each centroid with the reversed fractions so that the smallest
        # distance receives the largest share
        lstDistancesReordered = []
        for i, fPercentage in enumerate(reversed(lstPercentages)):
            lstDistancesReordered.append((lstDistances[i][0], fPercentage))
        # restore the original category order
        lstDistancesReordered.sort(key=lambda x: x[0])
        for i, tplPercentage in enumerate(lstDistancesReordered):
            lstFinalPercentages.append(tplPercentage[1])
            print('{:15s} {:3f}'.format(self.lstStrCategories[i], tplPercentage[1]))
        return lstFinalPercentages 
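    # isolated sketch of the inverted-distance heuristic above (illustrative values,
    # 3 centroids instead of 7):
    #
    #   distances to the centroids: [1.0, 3.0, 6.0] -> fDistSum = 10.0
    #   raw fractions (ascending):  [0.1, 0.3, 0.6]
    #   pairing the centroids with the reversed fractions hands the nearest
    #   centroid the largest share: [0.6, 0.3, 0.1] -> pseudo-probabilities summing to 1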
    def __printResult(self, tmpRepo, iLabel, iLabelAlt, bPrintWordHits=False):
        """
        prints the repository name and its category by using the iLabel
        :param tmpRepo: given repository
        :param iLabel: previously predicted label
        :return:
        """
        strStopper1 = "=" * 80
        strStopper2 = "-" * 80
        print(strStopper1)
        if bPrintWordHits is True:
            if self.bUseStringFeatures:
                # getWordOccurences() also fills the found-words dictionary printed below
                lstOccurence = tmpRepo.getWordOccurences(self.lstVoc)
                tmpRepo.printFeatureOccurences(tmpRepo.getDicFoundWords())
        print('Prediction for ' + tmpRepo.getName() + ', ' + tmpRepo.getUser() + ': ', end="")
        print(self.lstStrCategories[iLabel])
        if iLabelAlt is not None:
            print('Alternative: ', self.lstStrCategories[iLabelAlt])
        print(strStopper2)