forked from SpinazieSin/UvA-Home
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patharticle_test2.py
More file actions
72 lines (60 loc) · 2.69 KB
/
Copy patharticle_test2.py
File metadata and controls
72 lines (60 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import articlesearch
import newsextractor
import keywords
import numpy as np# !not necessary for production!
from nltk.stem.snowball import SnowballStemmer
class Articletest(object):
    """Suggest related news articles for an article based on keyword overlap.

    Holds an ``articlesearch.ArticleSearch`` built over the supplied articles
    and a ``keywords.KeyWords`` extractor. ``article_suggester`` ranks the
    keywords of one article, prints the ranking, and prints the titles of the
    top search hits for the best-ranked keywords.
    """

    def __init__(self, articles):
        """Create the searcher, keyword extractor and stemmer.

        Args:
            articles: Article collection passed unchanged to
                ``articlesearch.ArticleSearch``.
        """
        self.searcher = articlesearch.ArticleSearch(articles)
        self.keys = keywords.KeyWords()
        # Currently only referenced by the disabled stemming step noted in
        # _rank_keywords.
        self.stemmer = SnowballStemmer("english")

    def get_keywords(self, article):
        """Return the top keywords of ``article`` as a set.

        The article text is UTF-8 encoded before it is handed to
        ``KeyWords.extract_top``.
        NOTE(review): presumably extract_top expects encoded bytes -- confirm.
        """
        return set(self.keys.extract_top(article.text.encode('utf-8')))

    def _rank_keywords(self, articlekeys):
        """Return ``(keyword, score)`` pairs for ``articlekeys``, best first.

        For a keyword with more than one search hit, the score is the average
        fraction of ``articlekeys`` that also occur in the keywords of each
        hit. Keywords with zero or one hit score ``0``.
        """
        # TODO: stemming is disabled because it does not work properly yet:
        # the keys of the current article would be stemmed while those of the
        # target articles are not.
        # articlekeys = {self.stemmer.stem(key) for key in articlekeys}
        ranked = []
        for key in articlekeys:
            # Search once per keyword and reuse the hits for both the count
            # and the overlap computation.
            hits = self.searcher.search(key)
            # Check if we can find sufficient articles for this term.
            # 0 articles would give divide by 0 and 1 article would mean
            # we can only find this article, rendering the term useless.
            if len(hits) > 1:
                shared = sum(
                    len(articlekeys & self.get_keywords(hit[0]))
                    for hit in hits
                )
                possible = len(hits) * len(articlekeys)
                # float() keeps true division regardless of interpreter.
                ranked.append((key, float(shared) / possible))
            else:
                ranked.append((key, 0))
        # Ascending stable sort followed by a reversal: highest score first,
        # with tied keywords in reversed iteration order.
        ranked.sort(key=lambda pair: pair[1])
        ranked.reverse()
        return ranked

    def article_suggester(self, article):
        """Print a keyword ranking and related articles for ``article``.

        Prints the article title, every keyword with its score, the query
        built from the best keywords, and the titles of up to 10 search hits
        for that query.

        Args:
            article: Object exposing ``title`` and ``text`` attributes.

        Returns:
            The ``article`` argument, unchanged.
        """
        print("\n==========================================")
        print("Title: " + article.title)
        results = self._rank_keywords(self.get_keywords(article))
        print("==========================================")
        print("Keywords and ranking:")
        # An article without keywords yields no results; max() on an empty
        # sequence would raise ValueError, so fall back to width 0.
        maxlen = max(len(pair[0]) for pair in results) if results else 0
        for keypair in results:
            print("* {0:{width}} {1}".format(keypair[0], keypair[1], width=maxlen))
        print("==========================================")
        top_terms = [pair[0] for pair in results if pair[1] > .1]
        # Too few terms pass the threshold: use the three best-ranked ones.
        if len(top_terms) < 3:
            top_terms = [pair[0] for pair in results[:3]]
        query = " ".join(top_terms)
        print("The relevant news query is:\n'" + query + "'")
        print("Top 10 related news articles:")
        for art in self.searcher.search(query)[:10]:
            print("* " + art[0].title)
        return article
def main(sample_size=10, max_index=500):
    """Run the article suggester on randomly chosen extracted articles.

    Builds a ``newsextractor.NewsExtractor``, wraps it in ``Articletest`` and
    calls ``article_suggester`` on ``sample_size`` articles drawn (with
    replacement) from the first ``max_index`` entries of ``articles.news``.

    Args:
        sample_size: Number of random articles to process.
        max_index: Exclusive upper bound on the sampled article index; it is
            clamped to the number of available articles.
    """
    # setup replica class etc
    articles = newsextractor.NewsExtractor()
    articles.build_all()
    tester = Articletest(articles)
    article_list = articles.news
    # Clamp the bound so fewer than max_index articles cannot cause an
    # IndexError when indexing article_list.
    upper = min(max_index, len(article_list))
    if upper <= 0:
        # randint(0, 0) would raise ValueError; there is nothing to sample.
        print("No articles available to test.")
        return
    for i in np.random.randint(0, upper, sample_size):
        tester.article_suggester(article_list[i])
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()