# Source code for main_semi_supervised

"""
@file: main_semi_supervised.py
Created on 07.01.2017 18:20
@project: GitHubRepositoryClassifier

@author: QueensGambit

Sample usage of the repository-classifier
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn import decomposition
from sklearn.cluster import KMeans
from sklearn.semi_supervised import label_propagation
from prototype.repository_classifier import RepositoryClassifier
from prototype.utility_funcs.io_agent import InputOutputAgent
from matplotlib.colors import colorConverter

import matplotlib.patches as mpatches

import sys
from prototype.definitions.categories import CategoryStr


def main(args=None):
    """
    Sample workflow: load the exported model, classify a single repository
    and plot the semi-supervised label spreading on the stored training data.

    :param args: command-line arguments; falls back to ``sys.argv[1:]`` when ``None``.
                 The arguments are collected but not evaluated by this function.
    """
    if args is None:
        args = sys.argv[1:]

    InputOutputAgent.setWithToken(True)
    repoClassifier = RepositoryClassifier(bUseStringFeatures=True)

    # To retrain and export the model instead of loading it from file, use:
    #   lstTrainData, lstTrainLabels = repoClassifier.loadTrainingData('/data/csv/additional_data_sets_cleaned.csv')
    #   repoClassifier.trainModel(lstTrainData, lstTrainLabels)
    #   repoClassifier.exportModelToFile()
    (clf, lstMeanValues, matIntegerTrainingData, lstTrainLabels, lstTrainData,
     normalizer, normalizerIntegerAttr, lstTrainDataRaw) = repoClassifier.loadModelFromFile()

    print('Raw: ', lstTrainDataRaw)
    print('~~~~~~~~~~~~~ PREDICTION FROM SINGLE URL ~~~~~~~~~~~~~~~')
    iLabel, iLabelAlt, lstFinalPercentages, tmpRepo, lstNormedInputFeatures = \
        repoClassifier.predictCategoryFromURL('https://github.com/akitaonrails/vimfiles')
    # Alternative entry point, addressing the repository by owner and name:
    #   repoClassifier.predictCategoryFromOwnerRepoName('pobox', 'overwatch')
    #   repoClassifier.predictCategoryFromOwnerRepoName('QueensGambit', 'Barcode-App')

    # len() instead of truthiness: the training data may be a numpy array,
    # whose truth value is ambiguous.
    iNumSamples = len(lstTrainData)
    print('len(lstTrainData): ', iNumSamples)
    if iNumSamples == 0:
        # Nothing to index into and nothing to plot.
        print('No training data available - skipping the semi-supervised plot.')
        return

    print('len(lstTrainData[0]): ', len(lstTrainData[0]))
    print('lstTrainData:', lstTrainData)

    # Other visualisations that can be run on the loaded model:
    #   plot_multi_dim(clf, lstTrainData, lstTrainLabels)
    #   semisupervised(matIntegerTrainingData)
    semisupervised(lstTrainData)


def plot_multi_dim(clf, data, lstTrainLabels):
    """
    Cluster the samples with k-means and show the cluster regions together
    with the samples in a 2D plot.

    Data with more than two features is projected onto its first two
    principal components before clustering.

    :param clf: not used by this function; kept so existing calls keep working
    :param data: 2D array-like of samples (rows) and features (columns), at least two columns
    :param lstTrainLabels: not used by this function; kept so existing calls keep working
    :raises ValueError: if ``data`` is not two-dimensional or has fewer than two columns
    """
    # Accept plain nested lists as well as numpy arrays.
    data = np.asarray(data)
    if data.ndim != 2 or data.shape[1] < 2:
        raise ValueError('data must be a 2D array with at least two feature columns')

    if data.shape[1] > 2:
        pca = decomposition.PCA(n_components=2)
        data = pca.fit_transform(data)

    n_clusters = 7
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    kmeans.fit(data)

    # Step size of the mesh on which the cluster regions are evaluated.
    h = .02
    x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
    y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Assign every mesh point to its nearest cluster centre.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')

    # No per-point colour values are supplied, so no colormap is passed either
    # (matplotlib ignores it and warns otherwise).
    plt.scatter(data[:, 0], data[:, 1])

    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())

    # NOTE(review): the legend lists the category colours, while the regions
    # are k-means clusters drawn with the 'Paired' colormap - confirm that
    # this pairing is intended.
    lstPatches = [mpatches.Patch(color=strColor, label=strCategory)
                  for strCategory, strColor in zip(CategoryStr.lstStrCategories, CategoryStr.lstStrColors)]
    plt.legend(handles=lstPatches)

    plt.show()


def semisupervised(matIntegerTrainingData):
    """
    Build a partially labeled target vector for the given samples and plot
    the semi-supervised classification results.

    The category names are read from the ``CATEGORY`` column of
    ``additional_data_sets_cleaned.csv``. Every sample with an even index
    keeps its label, every sample with an odd index is marked as unlabeled (-1).

    :param matIntegerTrainingData: sequence of feature vectors, one per sample
    :raises ValueError: if there are more samples than rows in the CSV file,
                        or if a category name in the CSV file is unknown
    """
    # The CSV is located relative to the parent of the current working directory.
    pathCSV = Path().resolve().parent / 'data' / 'csv' / 'additional_data_sets_cleaned.csv'
    trainData = pd.read_csv(pathCSV, header=0, delimiter=",")
    lstStrCategories = ['DEV', 'HW', 'EDU', 'DOCS', 'WEB', 'DATA', 'OTHER']

    X = matIntegerTrainingData
    length = len(X)

    # NOTE(review): labels are matched to samples purely by position - this
    # presumes the samples are in the same order as the CSV rows; confirm.
    lstCategoryNames = trainData["CATEGORY"].tolist()
    if length > len(lstCategoryNames):
        raise ValueError('got %d samples but only %d labeled rows in the CSV file'
                         % (length, len(lstCategoryNames)))

    # Start with everything unlabeled (-1), then reveal every second label.
    # An explicit integer dtype keeps the class labels exact.
    y = np.full(length, -1, dtype=int)
    y[::2] = [lstStrCategories.index(strCategory) for strCategory in lstCategoryNames[:length:2]]

    plot(X, y, lstStrCategories)


def plot(X, y, lstStrCategories):
    """
    Plot the decision regions of label spreading / label propagation models
    trained with different amounts of hidden labels.

    The samples are projected onto two principal components. Four models are
    shown in a 2x2 grid: label spreading with 50% and 75% of the labels
    hidden, and label spreading and label propagation on the labels as given.
    A fifth label spreading model (30% hidden) is fitted and its predictions
    for the samples are printed.

    :param X: 2D array-like of samples (rows) and features (columns)
    :param y: one class index per sample; -1 marks an unlabeled sample
    :param lstStrCategories: category names, indexed by class index, used for the legend
    """
    # Integer labels are needed for the colour lookup; callers may pass floats.
    y = np.asarray(y).astype(int)

    pca = decomposition.PCA(n_components=2)
    X = pca.fit_transform(X)

    rng = np.random.RandomState(0)

    def _hide_labels(fraction):
        """Return a copy of y in which roughly `fraction` of the entries is set to -1."""
        y_masked = np.copy(y)
        y_masked[rng.rand(len(y)) < fraction] = -1
        return y_masked

    # The order of these calls fixes which random draws each mask receives.
    y_30 = _hide_labels(0.3)
    y_50 = _hide_labels(0.5)
    # 0.75 matches the variable name and the subplot title.
    y_75 = _hide_labels(0.75)

    ls50 = (label_propagation.LabelSpreading().fit(X, y_50), y_50)
    ls75 = (label_propagation.LabelSpreading().fit(X, y_75), y_75)
    ls100 = (label_propagation.LabelSpreading().fit(X, y), y)
    lp100 = (label_propagation.LabelPropagation().fit(X, y), y)

    clfLabelSpread = label_propagation.LabelSpreading()
    clfLabelSpread.fit(X, y_30)

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                         np.arange(y_min, y_max, .02))
    # The mesh is the same for every model, so flatten it only once.
    matGridPoints = np.c_[xx.ravel(), yy.ravel()]

    titles = ['Label Spreading 50%',
              'Label Spreading 75%',
              'Label Spreading 100%',
              'Label Propagation 100%']

    # One colour per class index; unlabeled samples (-1) are drawn white.
    lstColors = list(CategoryStr.lstStrColors)
    color_map = {-1: (1, 1, 1)}
    color_map.update((iClass, colorConverter.to_rgb(strColor))
                     for iClass, strColor in enumerate(lstColors))

    # Half-integer boundaries put each class index into its own filled band,
    # so every region gets exactly the colour of its class.
    levels = np.arange(len(lstColors) + 1) - 0.5

    for i, (clf, y_train) in enumerate((ls50, ls75, ls100, lp100)):
        plt.subplot(2, 2, i + 1)
        Z = clf.predict(matGridPoints).reshape(xx.shape)

        colors = [color_map[int(label)] for label in y_train]

        # Semi-transparent regions keep the equally coloured samples visible.
        plt.contourf(xx, yy, Z, levels=levels, colors=lstColors, alpha=0.4)
        # NOTE(review): fixed view window; samples outside [-1, 1] are not shown.
        plt.ylim(-1, 1)
        plt.xlim(-1, 1)

        plt.scatter(X[:, 0], X[:, 1], c=colors, s=80, edgecolors='k')

        plt.title(titles[i])

    matPredictRes = clfLabelSpread.predict(X)
    print('matPredictRes: ', matPredictRes)

    # Legend entries use the same colours as the samples and the regions.
    lstPatches = [mpatches.Patch(color=strColor, label=strCategory)
                  for strCategory, strColor in zip(lstStrCategories, lstColors)]
    plt.legend(handles=lstPatches)
    plt.show()


# Run the sample workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()