Source code for main_semi_supervised

"""
@file: main_semi_supervised.py
Created on 07.01.2017 18:20
@project: GitHubRepositoryClassifier

@author: QueensGambit

Sample usage of the repository-classifier
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn import decomposition
from sklearn.cluster import KMeans
from sklearn.semi_supervised import label_propagation
from prototype.repository_classifier import RepositoryClassifier
from prototype.utility_funcs.io_agent import InputOutputAgent
from matplotlib.colors import colorConverter

import matplotlib.patches as mpatches

import sys
from prototype.definitions.categories import CategoryStr


def main(args=None):
    """Run the sample workflow of the repository classifier.

    The function enables token usage on the ``InputOutputAgent``, obtains the
    model data via ``RepositoryClassifier.loadModelFromFile()``, requests a
    prediction for one hard-coded GitHub URL, prints some diagnostics about
    the training data and finally passes the training data on to
    :func:`semisupervised`.

    :param args: optional list of command line arguments; when ``None`` it is
                 replaced by ``sys.argv[1:]``. The value is not evaluated any
                 further inside this function.
    """
    args = sys.argv[1:] if args is None else args

    InputOutputAgent.setWithToken(True)
    classifier = RepositoryClassifier(bUseStringFeatures=True)

    # Only needed by the (disabled) training path below.
    strFilenameCSV = 'additional_data_sets_cleaned.csv'

    # Disabled alternative: train and export a fresh model instead of loading it.
    # lstTrainData, lstTrainLabels = classifier.loadTrainingData('/data/csv/' + strFilenameCSV)
    # classifier.trainModel(lstTrainData, lstTrainLabels)
    # classifier.exportModelToFile()

    # loadModelFromFile() yields eight values; only the training data
    # (position 5) and the raw training data (position 8) are used here.
    (_clf,
     _lstMeanValues,
     _matIntegerTrainingData,
     _lstTrainLabels,
     trainingData,
     _normalizer,
     _normalizerIntegerAttr,
     rawTrainingData) = classifier.loadModelFromFile()
    # classifier.predictResultsAndCompare()

    print('Raw: ', rawTrainingData)

    print('~~~~~~~~~~~~~ PREDICTION FROM SINGLE URL ~~~~~~~~~~~~~~~')
    # The five returned values are unpacked (which checks their count) but
    # not used afterwards.
    (_iLabel,
     _iLabelAlt,
     _lstFinalPercentages,
     _tmpRepo,
     _lstNormedInputFeatures) = classifier.predictCategoryFromURL(
        'https://github.com/akitaonrails/vimfiles')
    # Other repositories that were tried:
    # classifier.predictCategoryFromOwnerRepoName('pobox', 'overwatch')
    # classifier.predictCategoryFromOwnerRepoName('QueensGambit', 'Barcode-App')

    print('len(lstTrainData): ', len(trainingData))
    print('len(lstTrainData[0): ', len(trainingData[0]))
    print('lstTrainData:', trainingData)

    # Disabled alternatives:
    # matIntegerTrainingData = normalizer.transform(matIntegerTrainingData)
    # plot_multi_dim(clf, lstTrainData, lstTrainLabels)
    # semisupervised(matIntegerTrainingData)
    semisupervised(trainingData)
def plot_multi_dim(clf, data, lstTrainLabels):
    """Plot the samples on top of a k-means partition of the 2-D plane.

    Samples with more than two features are projected onto their first two
    principal components. A k-means model with 7 clusters is then fitted on
    the 2-D samples; its prediction on a dense grid is drawn as background
    image and the samples are scattered on top of it.

    :param clf: not used by this function; kept so existing callers keep
                working
    :param data: 2-D array-like with one row per sample and one column per
                 feature (lists of lists are accepted and converted)
    :param lstTrainLabels: not used by this function; kept so existing
                           callers keep working
    """
    # Accept plain (nested) lists as well: ``.shape`` and ``[:, 0]`` below
    # require an ndarray.
    data = np.asarray(data)

    if data.shape[1] > 2:
        pca = decomposition.PCA(n_components=2)
        pca.fit(data)
        data = pca.transform(data)

    n_clusters = 7
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    kmeans.fit(data)

    # Step size of the background grid; the grid extends one unit beyond
    # the data range in every direction.
    h = .02
    x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
    y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Cluster index for every grid node, reshaped back to the grid layout.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')

    # No per-sample colours are supplied, so a colormap would be ignored.
    # NOTE(review): the points are therefore not coloured by category; the
    # legend below only lists the category palette - confirm this is intended.
    plt.scatter(data[:, 0], data[:, 1])

    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())

    # One legend entry per category, using the colour at the same index.
    lstPatches = [
        mpatches.Patch(color=CategoryStr.lstStrColors[i], label=strCategory)
        for i, strCategory in enumerate(CategoryStr.lstStrCategories)
    ]
    plt.legend(handles=lstPatches)

    plt.show()
def semisupervised(matIntegerTrainingData):
    """Build a partially labelled target vector and plot the propagation.

    The category of every sample is read from the ``CATEGORY`` column of
    ``data/csv/additional_data_sets_cleaned.csv``, located relative to the
    parent of the current working directory. Samples at even positions keep
    their category index, samples at odd positions are marked as unlabelled
    with ``-1``. Samples and targets are then passed to :func:`plot`.

    :param matIntegerTrainingData: sequence of samples; sample ``i`` is
                                   matched with CSV row ``i``
    :raises ValueError: if a CSV category is not one of the known categories
    :raises KeyError: if the CSV has no ``CATEGORY`` column or fewer rows
                      than needed
    """
    lstStrCategories = ['DEV', 'HW', 'EDU', 'DOCS', 'WEB', 'DATA', 'OTHER']

    # Same location as before: <parent of cwd>/data/csv/<file>.
    csvPath = (Path().resolve().parent
               / 'data' / 'csv' / 'additional_data_sets_cleaned.csv')
    trainData = pd.read_csv(str(csvPath), header=0, delimiter=",")
    categories = trainData["CATEGORY"]

    iNumSamples = len(matIntegerTrainingData)

    # Start with every sample unlabelled (-1) and label every second one.
    # A built-in ``int`` dtype is used because the ``np.int`` alias no longer
    # exists in current NumPy releases.
    y = np.full(iNumSamples, -1, dtype=int)
    for i in range(0, iNumSamples, 2):
        y[i] = lstStrCategories.index(categories[i])

    plot(matIntegerTrainingData, y, lstStrCategories)
def plot(X, y, lstStrCategories):
    """Compare label spreading / propagation on a 2-D PCA projection.

    The samples are projected onto two principal components. Copies of ``y``
    with additional randomly chosen labels replaced by ``-1`` are used to fit
    several ``LabelSpreading`` models; one ``LabelSpreading`` and one
    ``LabelPropagation`` model are fitted on ``y`` itself. The decision
    regions of four models are drawn in a 2x2 grid together with the samples
    coloured by the labels each model was trained on.

    :param X: 2-D array-like of samples (rows) and features (columns)
    :param y: label per sample; values must be ``-1`` (shown in white) or an
              index into ``CategoryStr.lstStrColors``
    :param lstStrCategories: category names used as legend texts
    """
    pca = decomposition.PCA(n_components=2)
    pca.fit(X)
    X = pca.transform(X)

    rng = np.random.RandomState(0)

    def _mask_labels(fraction):
        """Return a copy of ``y`` with roughly ``fraction`` of it set to -1."""
        masked = np.copy(y)
        masked[rng.rand(len(y)) < fraction] = -1
        return masked

    # The order of these calls determines the random masks; keep it.
    y_30 = _mask_labels(0.3)
    y_50 = _mask_labels(0.5)
    # NOTE(review): the threshold is 0.8 although the name and the subplot
    # title say 75% - confirm which value is intended.
    y_75 = _mask_labels(0.8)

    # (title, fitted model, labels the model was trained on)
    lstModels = (
        ('Label Spreading 50%',
         label_propagation.LabelSpreading().fit(X, y_50), y_50),
        ('Label Spreading 75%',
         label_propagation.LabelSpreading().fit(X, y_75), y_75),
        ('Label Spreading 100%',
         label_propagation.LabelSpreading().fit(X, y), y),
        ('Label Propagation 100%',
         label_propagation.LabelPropagation().fit(X, y), y),
    )

    # This model is not plotted; its predictions are only printed below.
    clfLabelSpread = label_propagation.LabelSpreading()
    clfLabelSpread.fit(X, y_30)

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                         np.arange(y_min, y_max, .02))
    # Identical for every model, so build the prediction grid only once.
    matGrid = np.c_[xx.ravel(), yy.ravel()]

    # -1 (no label) is drawn in white, every category in its own colour.
    color_map = {-1: (1, 1, 1)}
    color_map.update(
        (i, colorConverter.to_rgb(strColor))
        for i, strColor in enumerate(CategoryStr.lstStrColors))

    cs = None
    for i, (strTitle, clf, y_train) in enumerate(lstModels):
        plt.subplot(2, 2, i + 1)

        Z = clf.predict(matGrid).reshape(xx.shape)
        # contourf() has no ``c`` argument; the regions are coloured through
        # the colormap only.
        cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)

        # plt.axis('off')
        # The visible area is fixed to [-1, 1] on both axes.
        plt.ylim(-1, 1)
        plt.xlim(-1, 1)

        # Explicit RGB colours per sample, so no colormap is needed here.
        colors = [color_map[int(label)] for label in y_train]
        plt.scatter(X[:, 0], X[:, 1], c=colors, s=80)

        plt.title(strTitle)

    # Proxy rectangles with the fill colours of the last contour set;
    # legend_elements() avoids the deprecated ``ContourSet.collections``.
    # NOTE(review): the patches are paired with the category names purely by
    # position - verify that contour levels and categories line up.
    proxy, _ = cs.legend_elements()

    matPredictRes = clfLabelSpread.predict(X)
    print('matPredictRes: ', matPredictRes)

    plt.legend(proxy, lstStrCategories)
    plt.show()
# Run the sample workflow only when this module is executed as a script.
if __name__ == "__main__":
    main()