Merge pull request #10 from isselab/cleanup

TimTheHero · web-flow · commit a56df9df227d · 2025-03-14T12:48:18.000+01:00
Cleanup
diff --git a/AstToEcoreConverter.py b/AstToEcoreConverter.py
@@ -1099,7 +1099,7 @@ def get_method_def_from_internal_structure(self, method_name, module):
                         return current_method[0]
         return None
 
-    def create_method_signature(self, method_node, name, arguments, return_type = None):
+    def create_method_signature(self, method_node, name, arguments, return_type=None):
         """
         Creates a method signature for a method definition.
 
@@ -1134,7 +1134,7 @@ def create_method_signature(self, method_node, name, arguments, return_type = No
 
             # Add type for TParameter.type
             parameter_type = self.create_ecore_instance(NodeTypes.CLASS)
-            #parameter_type.tName = arg.annotation if arg.annotation else 'None'
+            # parameter_type.tName = arg.annotation if arg.annotation else 'None'
             parameter.type = parameter_type
 
         method_node.signature = method_signature
diff --git a/CustomDataset.py b/CustomDataset.py
@@ -9,7 +9,7 @@
 from DataformatUtils import convert_edge_dim, convert_list_to_float_tensor, convert_list_to_long_tensor, \
     convert_hashed_names_to_float
 from Encoder import multi_hot_encoding
-from GraphClasses import defined_labels
+from settings import CONFIG
 
 
 class RepositoryDataset(Dataset):
@@ -30,7 +30,8 @@ def __init__(self, directory, label_list=None):
                 print(e)
         # nodes have 11 features, their one hot encoded node type, hashed name, and one hot encoded library flag
         self.num_node_features = 11
-        self.num_classes = len(defined_labels)
+        self.defined_labels = CONFIG['graph']['defined_labels']
+        self.num_classes = len(self.defined_labels)
         self.directory = directory
         self.graph_names = []
         self.graph_dir = os.listdir(directory)
@@ -162,7 +163,7 @@ def convert_labeled_graphs(self, labels):
             graph_labels)  # count how many repos are in each class
 
         # encode labels
-        encoded_nodes = multi_hot_encoding(defined_labels, graph_labels)
+        encoded_nodes = multi_hot_encoding(self.defined_labels, graph_labels)
         file = zip(graph_names, encoded_nodes)
         return file
 
diff --git a/EcoreToMatrixConverter.py b/EcoreToMatrixConverter.py
@@ -335,7 +335,8 @@ def convert_subpackages_recursive(self, t_package):
             t_package: The package to convert subpackages from.
         """
         for t_subpackage in t_package.subpackages:
-            current_subpackage = self.get_node_in_container(t_subpackage.tName, NodeTypes.PACKAGE.value, t_package.tName,
+            current_subpackage = self.get_node_in_container(t_subpackage.tName, NodeTypes.PACKAGE.value,
+                                                            t_package.tName,
                                                             NodeTypes.PACKAGE.value)
             if current_subpackage is None:
                 self.node_matrix.append(NodeTypes.PACKAGE.value)
diff --git a/GCN.py b/GCN.py
@@ -4,6 +4,7 @@
 
 '''defines the architecture of the graph convolutional network'''
 
+
 class GCN(torch.nn.Module):
     def __init__(self, num_node_features, num_classes, hidden_channels):
         super(GCN, self).__init__()
@@ -33,4 +34,4 @@ def forward(self, x, edge_index, edge_attr, batch=None):
         # sigmoid activation function for multi-label
         x = f.sigmoid(x)
 
-        return x
+        return x
diff --git a/GraphClasses.py b/GraphClasses.py
diff --git a/NodeFeatures.py b/NodeFeatures.py
@@ -17,20 +17,20 @@ class NodeTypes(Enum):
     CLASS = "TClass"
     # TMethod
     METHOD = "TMethod"
-    METHOD_SIGNATURE = "TMethodSignature" # missing firstParameter does not need to be implemented.
-    METHOD_DEFINITION = "TMethodDefinition"# missing "".overloading and "".overloadedBY does not need to be implemented.
+    METHOD_SIGNATURE = "TMethodSignature"  # missing firstParameter does not need to be implemented.
+    METHOD_DEFINITION = "TMethodDefinition"  # missing "".overloading and "".overloadedBY does not need to be implemented.
     PARAMETER = "TParameter"
     # TField
     FIELD = "TField"
-    FIELD_SIGNATURE = "TFieldSignature" # Todo implement this in AstToEcoreConverter (only missing TFieldSignature.type)
-    FIELD_DEFINITION = "TFieldDefinition" #  missing TFieldDefinition.hidden and "".hiddenBy does not to be implemented
+    FIELD_SIGNATURE = "TFieldSignature"  # Todo implement this in AstToEcoreConverter (only missing TFieldSignature.type)
+    FIELD_DEFINITION = "TFieldDefinition"  # missing TFieldDefinition.hidden and "".hiddenBy does not need to be implemented
     # TAccess
     CALL = "TCall"
     READ = "TRead"  # Todo implement this in AstToEcoreConverter
     WRITE = "TWrite"  # Todo implement this in AstToEcoreConverter
     READ_WRITE = "TReadWrite"  # Todo implement this in AstToEcoreConverter
-    #TInterface
+    # TInterface
     INTERFACE = "TInterface"
     # In Python, there is no formal concept of interfaces as found in some other programming languages like Java or C#.
     # However, Python supports a similar concept through the use of abstract base classes (ABCs) and duck typing.
-    # The return on investment probably is not sufficient to justify the implementation.
+    # The return on investment probably is not sufficient to justify the implementation.
diff --git a/Pipeline.py b/Pipeline.py
@@ -239,7 +239,6 @@ def prepare_dataset(repository_directory, output_directory=None, repository_list
     global repo_multiprocess, ecore_graph
     global node_features, adj_list, edge_attribute
 
-
     # clone repositories for the dataset
     if repository_list is not None:
         download_repositories(repository_directory, repository_list)
diff --git a/README.md b/README.md
@@ -1,27 +1,70 @@
-# github-classifier
+# Classifier for GitHub Repos
 
-**short description**
+## Table of Contents
+- [Intro](#intro)
+- [Installation for Users](#installation-instruction-for-users)
+- [Installation for Devs](#installation-instruction-for-devs)
+- [Expectation for Devs](#expectation-for-devs)
+- [Known Problems / Limitations](#known-problems--limitations)
+- [Help](#help)
 
-This repository contains a deep-learning based classification tool for software repositories. The tool utilizes the ecore metamodel 'type graph' and a graph convolutional network. To use the tool, run 'main.py' after adding the directory containing the repositories you want to classify.
+## Intro:
 
-If you want to train the tool with different labels, replace the current labels with your own (or add them to the labels) in GraphClasses.py, and in function 'multi_hot_encoding' in Encoder.py. Optionally also in function 'count_class_elements' in CustomDataset.py if you want to know the number of samples in each class in your dataset. 
-The labels in the tool are not mutually exclusive and are multi-hot encoded.
+This repository features a deep learning classifier designed for the analysis of software repositories.
+The tool employs the ecore metamodel's 'type graph' in conjunction with a graph convolutional network.
+Presently, the classifier categorizes repositories into four distinct classes: Application, Framework, Library, and Plugin.
+It is important to note that the labels utilized by the tool are **not mutually exclusive** and are represented in a multi-hot encoded format.
 
-Currently, the tool only processes Python files.
+## Installation Instruction for Users:
+1. Clone the repository by executing the following command:  
+`git clone https://github.com/isselab/github-classifier.git`
+2. Open the cloned repository using your preferred Integrated Development Environment (IDE).  
+For the purposes of this instruction, we will assume the use of PyCharm from JetBrains.
+3. From the root of the cloned repository, change into the data/input directory by running the following command:  
+`cd data/input`
+4. Clone the repositories you wish to analyze by executing:  
+`git clone LINK_TO_REPO_YOU_WANT`
+5. Run main.py.
 
-**labels**
+The default threshold for identification is set at 50%.
+If you wish to modify this threshold, please locate the relevant settings in the settings.py file.
+After making the necessary adjustments, rerun main.py to apply the changes.
 
-Application, Framework, Library, Plugin
+## Installation Instruction for Devs:
 
-**data**
+### Basic Installation:
+1. Clone the repository by executing the following command:  
+`git clone https://github.com/isselab/github-classifier.git`
+2. Open the cloned repository using your preferred Integrated Development Environment (IDE).
 
-Dataset with Python software repositories from GitHub, all with a dependency on at least one ML library.
-The labeled repositories the tool is trained with are in data/labeled_dataset_repos.xlsx.
+### Retraining:
+1. Check data/labeled_dataset_repos.xlsx.  
+This xlsx file contains the labeled repositories the tool is trained with.  
+You may want to change it according to your needs.
+2. We strongly recommend utilizing a GPU for training purposes.  
+To verify GPU availability, please run the TorchGPUCheck.py script.  
+If you get the result "CUDA is available!", you may proceed to step 3.  
+If the output indicates that "CUDA is not available.", please follow the instructions provided in the terminal.  
+Additionally, refer to the guide in the [Help](#help) section for further assistance in resolving any issues.
+3. Run prepareDataset.py
+4. Change the experiment_name in settings.py in the training section.
+5. Run train.py
 
-**requirements**
 
-pyecore~=0.14.0 or higher versions
+## Expectation for Devs:
+### Recommended Workflow:
+1. Create an issue on the GitHub issues page.
+2. Open a branch named after the issue.
+3. Write code that fixes the issue.
+4. Write test code to be sure it works.
+5. Comment your code well to be sure it can be understood.
+6. Create a pull request.
 
-autopep8
+## Known Problems / Limitations:
+- The tool only processes Python files.
+- The dataset contains Python software repositories from GitHub, all with a dependency on at least one ML library.
+- Labels cannot be changed easily yet (work in progress).
 
-GRaViTY tool for visualizing the metamodels, see "https://github.com/GRaViTY-Tool/gravity-tool?tab=readme-ov-file" for instructions on how to install the tool
+## Help
+- Torch CUDA Guide, see "https://www.geeksforgeeks.org/how-to-set-up-and-run-cuda-operations-in-pytorch/"
+- GRaViTY tool for visualizing the metamodels, see "https://github.com/GRaViTY-Tool/gravity-tool?tab=readme-ov-file"
diff --git a/TorchGPUCheck.py b/TorchGPUCheck.py
@@ -0,0 +1,24 @@
+"""
+Check whether CUDA is available to PyTorch and, if it is not, print step-by-step instructions on how to enable it.
+"""
+
+import torch
+
+if __name__ == "__main__":
+    print(torch.__version__)  # installed PyTorch version string (torch.torch_version is a module, not the version)
+    # Report the visible GPUs when CUDA is usable; otherwise print setup instructions
+    if torch.cuda.is_available():
+        print("CUDA is available!")
+        print(f"Number of GPUs: {torch.cuda.device_count()}")
+        print(f"Current GPU: {torch.cuda.get_device_name(torch.cuda.current_device())}")
+    else:
+        print("CUDA is not available.")
+        print("To enable CUDA, follow these steps:")
+        print("1. **Install NVIDIA Drivers**: Ensure you have the latest NVIDIA drivers installed on your system.")
+        print(
+            "2. **Install CUDA Toolkit**: Download and install the CUDA Toolkit from the official NVIDIA website: https://developer.nvidia.com/cuda-downloads")
+        print(
+            "3. **Verify CUDA Installation**: After installation, verify that CUDA is working correctly by running the `nvidia-smi` command in your terminal/command prompt.")
+        print(
+            "4. **Update PyTorch**: Make sure you're using the latest version of PyTorch. You can update PyTorch using pip: `pip install --upgrade torch`")
+        print("5. **Restart Your System**: Restart your system to ensure that the changes take effect.")
diff --git a/pep8autoformat.py b/pep8autoformat.py
@@ -1,5 +1,6 @@
 import autopep8
 
+
 def format_python_file(path_to_file):
     try:
         # Read the current content of the file
@@ -21,4 +22,4 @@ def format_python_file(path_to_file):
 if __name__ == "__main__":
     # Specify the file path you want to format
     file_path = 'AstToEcoreConverter.py'
-    format_python_file(file_path)
+    format_python_file(file_path)
diff --git a/torch_gpu_check.py b/torch_gpu_check.py
diff --git a/train.py b/train.py
@@ -11,7 +11,6 @@
 
 from CustomDataset import RepositoryDataset
 from GCN import GCN
-from GraphClasses import defined_labels
 from settings import CONFIG
 
 '''please prepare the dataset you want to train the tool with by using prepareDataset.py,
@@ -27,14 +26,15 @@
 threshold = CONFIG['training']['threshold']
 save_classification_reports = CONFIG['training']['save_classification_reports']
 experiment_name = CONFIG['training']['experiment_name']
+defined_labels = CONFIG['graph']['defined_labels']
 
 
 def train():
     model.train()
 
     num_classes = int(len(defined_labels))
 
-    for graph in tqdm(trainloader,desc = "Training"):
+    for graph in tqdm(trainloader, desc="Training"):
 
         if device == 'cuda':
             graph.x = graph.x.to(device)
@@ -67,7 +67,7 @@ def test(loader):
     total = 0
     num_classes = int(len(defined_labels))
 
-    for graph in tqdm(loader,desc = "Testing"):
+    for graph in tqdm(loader, desc="Testing"):
 
         if device == 'cuda':
             graph.x = graph.x.to(device)