-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathfuzzy_categorize_documents.py
More file actions
83 lines (65 loc) · 3.32 KB
/
fuzzy_categorize_documents.py
File metadata and controls
83 lines (65 loc) · 3.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import os
import shutil
import configparser
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
# Read settings from settings.ini, falling back to the built-in defaults for
# anything that is missing: the whole file, a section, or a single option.
_DEFAULT_TXT_DOCUMENTS = os.path.join(os.getcwd(), "txt_output")
_DEFAULT_SIMILARITY_THRESHOLD = 70.0
_DEFAULT_CATEGORY_DEFINITIONS = {
    "legal": "legal document contract agreement",
    "financial": "financial invoice receipt payment",
    "technical": "technical specification documentation",
}

config = configparser.ConfigParser()
if os.path.exists("settings.ini"):
    config.read("settings.ini")

    # has_option() is False when either the section or the option is absent,
    # so a partially filled settings.ini no longer aborts the script with
    # NoSectionError / NoOptionError. A value that is present but invalid
    # (e.g. a non-numeric threshold) still raises, so typos are not hidden.
    if config.has_option("paths", "txt_documents"):
        TXT_DOCUMENTS = config.get("paths", "txt_documents")
    else:
        print("Warning: [paths] txt_documents not set in settings.ini. Using default value.")
        TXT_DOCUMENTS = _DEFAULT_TXT_DOCUMENTS

    if config.has_option("similarity", "similarity_threshold"):
        SIMILARITY_THRESHOLD = config.getfloat("similarity", "similarity_threshold")
    else:
        print("Warning: [similarity] similarity_threshold not set in settings.ini. Using default value.")
        SIMILARITY_THRESHOLD = _DEFAULT_SIMILARITY_THRESHOLD

    if config.has_section("categories"):
        # Maps category name -> free-text definition used for fuzzy matching.
        CATEGORY_DEFINITIONS = dict(config.items("categories"))
    else:
        print("Warning: [categories] section not found in settings.ini. Using default categories.")
        CATEGORY_DEFINITIONS = dict(_DEFAULT_CATEGORY_DEFINITIONS)
else:
    # Use defaults if settings.ini doesn't exist
    print("Warning: settings.ini not found. Using default values.")
    TXT_DOCUMENTS = _DEFAULT_TXT_DOCUMENTS
    SIMILARITY_THRESHOLD = _DEFAULT_SIMILARITY_THRESHOLD
    CATEGORY_DEFINITIONS = dict(_DEFAULT_CATEGORY_DEFINITIONS)
    print(f"Default path: {TXT_DOCUMENTS}")
    print(f"Default similarity threshold: {SIMILARITY_THRESHOLD}")
    print(f"Default categories: {list(CATEGORY_DEFINITIONS.keys())}")
def extract_text_from_txt(file_path):
    """Return the entire contents of the text file at ``file_path``.

    The file is decoded as UTF-8; any error raised by ``open`` or by the
    decoding propagates to the caller.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return handle.read()
def move_file_to_subfolder(file_path, category):
    """Move ``file_path`` into a sibling folder named after ``category``.

    The folder is created next to the file if it does not exist yet, and the
    moved file is renamed to ``<category>_<original name>``.
    """
    parent_dir, original_name = os.path.split(file_path)
    target_dir = os.path.join(parent_dir, category)
    os.makedirs(target_dir, exist_ok=True)
    destination = os.path.join(target_dir, f"{category}_{original_name}")
    shutil.move(file_path, destination)
def find_most_similar_category(text, categories, threshold):
    """Return the name of the category whose definition best matches ``text``.

    Args:
        text: Document text to classify.
        categories: Mapping of category name -> definition string.
        threshold: Minimum score the best match must reach.

    Returns:
        The best-matching category name, or ``None`` when ``categories`` is
        empty or the best score is below ``threshold``.

    Prints the score of each ranked category as a side effect.
    """
    # Nothing to compare against; the scorer would otherwise yield no result
    # to unpack.
    if not categories:
        return None

    # Change the scorer here to try different matching algorithms,
    # e.g. fuzz.token_set_ratio instead of fuzz.token_sort_ratio.
    scorer = fuzz.token_sort_ratio

    # Passing the mapping itself (not .values()) makes fuzzywuzzy return
    # (definition, score, name) triples, so the name comes straight from the
    # match instead of a reverse lookup by definition text, which picks the
    # wrong name when two categories share a definition. A single call
    # scores the document once; results are ordered best-first and capped by
    # process.extract's default limit, as before.
    ranked = process.extract(text, categories, scorer=scorer)
    if not ranked:
        return None

    for _definition, score, cat_name in ranked:
        print(f"Similarity between document and category '{cat_name}': {score}")

    _best_definition, highest_similarity, best_name = ranked[0]
    if highest_similarity >= threshold:
        return best_name
    return None
def main():
    """Categorize every ``.txt`` file found directly in ``TXT_DOCUMENTS``.

    Each file is read, matched against ``CATEGORY_DEFINITIONS`` and, when the
    best score reaches ``SIMILARITY_THRESHOLD``, moved into a category
    subfolder. Files that could not be categorized (below threshold,
    unreadable, or not movable) are listed one per line in ``exception.txt``
    in the current working directory.

    Raises:
        SystemExit: If ``TXT_DOCUMENTS`` is not an existing directory.
    """
    # Fail early with a readable message instead of a raw traceback from
    # os.listdir when the configured folder is missing.
    if not os.path.isdir(TXT_DOCUMENTS):
        raise SystemExit(f"Error: document folder not found: {TXT_DOCUMENTS}")

    with open("exception.txt", "w", encoding="utf-8") as exception_file:
        for filename in os.listdir(TXT_DOCUMENTS):
            if not filename.endswith(".txt"):
                continue

            file_path = os.path.join(TXT_DOCUMENTS, filename)

            # One unreadable or non-UTF-8 file must not abort the whole
            # batch; record it and carry on with the remaining files.
            try:
                txt_text = extract_text_from_txt(file_path)
            except (OSError, UnicodeDecodeError) as error:
                exception_file.write(f"{filename}\n")
                print(f"{filename} NOT categorized (could not read file: {error})")
                continue

            most_similar_category = find_most_similar_category(
                txt_text, CATEGORY_DEFINITIONS, SIMILARITY_THRESHOLD
            )
            print(
                f"Processing {filename}: Most similar category: {most_similar_category}"
            )

            if most_similar_category is None:
                exception_file.write(f"{filename}\n")
                print(f"{filename} NOT categorized (below threshold)")
                continue

            try:
                move_file_to_subfolder(file_path, most_similar_category)
            except OSError as error:
                exception_file.write(f"{filename}\n")
                print(f"{filename} NOT categorized (could not move file: {error})")
            else:
                print(f"{filename} categorized as {most_similar_category}")

    print("Categorization process completed.")


if __name__ == "__main__":
    main()