forked from SpinazieSin/UvA-Home
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patharticlesearch.py
More file actions
138 lines (119 loc) · 4.81 KB
/
Copy patharticlesearch.py
File metadata and controls
138 lines (119 loc) · 4.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import re
import math
import operator
import newsextractor
import datetime
from nltk.stem.snowball import SnowballStemmer
from collections import Counter
from difflib import SequenceMatcher
class ArticleSearch(object):
    """
    Searches a given list of articles for a search term with some parameters
    """

    def __init__(self, article_list, sources=None):
        """
        Set up the stemmer, the article collection and the word tokenizer.
        @param article_list Object whose `news` attribute holds the articles
                            (generated by NewsExtractor().build_all())
        @param sources Accepted for backward compatibility; not used here
        """
        self.stemmer = SnowballStemmer("english")
        self.article_list = article_list.news
        self.word = re.compile(r'\w+')

    # An empty search term returns every article that passes the other
    # filters. Multiple keywords (term2) are not supported yet.
    # 'cat' stands for category.
    def search(self, term1="", term2="", cat1=None, cat2=None, date1=None, date2=None, place=None,
               source1=None, source2=None):
        """
        Filter the articles and rank them against the search term.
        @param term1 The string to be matched against each article's term_count;
                     None is treated as the empty string
        @param term2 Accepted for backward compatibility; not used
        @param cat1 If given, only articles whose category equals it are kept
        @param cat2 Accepted for backward compatibility; not used
        @param date1 Earliest publication date (defaults to timestamp 0)
        @param date2 Latest publication date (defaults to the current time)
        @param place Accepted for backward compatibility; not used for filtering
        @param source1 If given, only articles whose source equals it are kept
        @param source2 Accepted for backward compatibility; not used
        @return List sorted by descending score. With a search term each entry is
                [article, normalized_score, raw_score]; without one it is [article, 1]
        """
        if term1 is None:
            term1 = ""
        min_date = datetime.datetime.fromtimestamp(0) if date1 is None else date1
        max_date = datetime.datetime.now() if date2 is None else date2

        # Stem every distinct query word; fall back to the raw word when the
        # stem collapses to a single character or less.
        search_term_stemmed = []
        for term in self.text_to_vector(term1.lower()):
            stem = self.stemmer.stem(term)
            search_term_stemmed.append(stem if len(stem) > 1 else term)

        has_term = term1 != ""
        normalize_score = 0
        scored_articles = []
        for article in self.article_list:
            # filters
            if article.published == '':
                continue
            # Compare as naive datetimes, since the bounds may be naive.
            published = article.published.replace(tzinfo=None)
            if not (min_date <= published <= max_date):
                continue
            if source1 is not None and article.source != source1:
                continue
            if cat1 is not None and article.category != cat1:
                continue

            if not has_term:
                # No term to rank by: every surviving article gets the same score.
                scored_articles.append([article, 1])
                continue

            highest_score = self.similar(search_term_stemmed, article.term_count)
            if highest_score > 0:
                scored_articles.append([article, highest_score, highest_score])
                if highest_score > normalize_score:
                    normalize_score = highest_score

        # Scale scores so that the best match is 1.0; the raw score stays in slot 2.
        if normalize_score > 0:
            for entry in scored_articles:
                entry[1] /= normalize_score
        return sorted(scored_articles, key=operator.itemgetter(1), reverse=True)

    def text_to_list(self, text):
        """
        Split a string into its \\w+ tokens, in order of appearance.
        @param text String to be split
        """
        return self.word.findall(text)

    def get_cosine(self, vec1, vec2):
        """
        Get the cosine similarity between two term-count mappings.
        @param vec1 Mapping of term -> count
        @param vec2 Mapping of term -> count
        @return Similarity as a float; 0.0 when either mapping has zero magnitude
        """
        sum1 = sum(value ** 2 for value in vec1.values())
        sum2 = sum(value ** 2 for value in vec2.values())
        denominator = math.sqrt(sum1) * math.sqrt(sum2)
        if not denominator:
            return 0.0
        intersection = set(vec1) & set(vec2)
        numerator = sum(vec1[x] * vec2[x] for x in intersection)
        return float(numerator) / denominator

    def text_to_vector(self, text):
        """
        Split string with \\w+ and count the occurrences of every token.
        @param text String to be split
        @return Counter mapping token -> number of occurrences
        """
        return Counter(self.word.findall(text))  # cosine similarity method

    def similar(self, count1, count2):
        """
        Score how well the terms in count1 are covered by count2.
        A term that occurs once adds 1; a term that occurs more often adds
        1 plus a hundredth of its count, so distinct matching terms weigh
        more than repetitions of one term.
        @param count1 Iterable of terms to look up
        @param count2 Mapping of term -> count (a Counter or a plain dict)
        @return The accumulated score as a float
        """
        score = 0.0
        for term in count1:
            # .get keeps plain dicts from raising KeyError on missing terms.
            occurrences = count2.get(term, 0)
            if occurrences > 1:
                score += 1 + (float(occurrences) / 100)
            else:
                score += occurrences
        return score