-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathSCC.py
More file actions
56 lines (47 loc) · 1.81 KB
/
SCC.py
File metadata and controls
56 lines (47 loc) · 1.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
import glob,os
import tkinter
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from tkinter import *
import sys
#Loading the dataset
def load_labelled_files(root, labels, pattern="*.txt"):
    """Read every file matching *pattern* under ``root/<label>/``.

    Args:
        root: Directory that holds one sub-directory per label.
        labels: Iterable of sub-directory names; each name doubles as the
            class label. A name listed more than once is visited only once.
        pattern: Glob pattern selecting the files inside each sub-directory.

    Returns:
        A ``(texts, targets)`` pair of equal-length lists: the contents of
        each file and the label of the directory it was found in.
        Sub-directories that do not exist simply contribute nothing.
    """
    texts = []
    targets = []
    # dict.fromkeys drops repeated labels while keeping first-seen order, so
    # a label listed twice cannot have its files loaded (and counted) twice.
    for label in dict.fromkeys(labels):
        # Escape the directory part so only `pattern` is treated as a glob.
        directory = glob.escape(os.path.join(root, label))
        # Sorted so the sample order does not depend on the filesystem.
        for file_path in sorted(glob.glob(os.path.join(directory, pattern))):
            # errors='replace' keeps one file with stray bytes from aborting
            # the whole load; `with` closes the handle after reading.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as handle:
                texts.append(handle.read())
            targets.append(label)
    return texts, targets


print('Loading the dataset')
code_loc = '/Users/kamel/Downloads/DataAlogithmia/code25/'
# One sub-directory of code_loc per language; each entry is listed once.
name_file = [
    'c', 'c#', 'c++', 'java', 'css', 'haskell', 'html', 'javascript', 'lua',
    'objective-c', 'perl', 'php', 'python', 'ruby', 'r', 'scala', 'sql',
    'swift', 'vb.net', 'markdown', 'bash',
]
# X holds the raw file contents, y the matching language labels.
X, y = load_labelled_files(code_loc, name_file)
print('Extracting features from dataset')
# Stop with a readable message instead of the vectorizer's "empty vocabulary"
# error when no training text is available.
if not X:
    raise SystemExit('No training files were loaded; check code_loc.')
#Extracting features from text files
# Raw token counts only: the TfidfTransformer below does the TF-IDF weighting,
# so the weighting is applied exactly once. No stop list is used because the
# English stop list contains words such as "if", "for" and "while", which are
# keywords in the languages being classified.
count_vect = CountVectorizer(input='content', lowercase=True, analyzer='word')
X_train_counts = count_vect.fit_transform(X)
#From occurrences to frequencies
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Fit a Multinomial Naive Bayes model on the TF-IDF matrix, then classify one
# snippet typed at the prompt.
print('Training a Multinomial Naive Bayes (MNB)')
clf = MultinomialNB()
clf.fit(X_train_tfidf, y)

# NOTE: input() returns a single line, so only the first line of a pasted
# multi-line snippet is classified.
var = input("Please enter a code snippet: ")
docs_new = [var]
# Reuse the fitted vectorizer and transformer so the snippet is mapped into
# the same feature space the classifier was trained on.
X_new_tfidf = tfidf_transformer.transform(count_vect.transform(docs_new))
predicted = clf.predict(X_new_tfidf)
print('predicted as', predicted)