UvA-Home/articlesearch.py at master · IntelligentRoboticsLab/UvA-Home · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import re
import math
import operator
import newsextractor
import datetime
from nltk.stem.snowball import SnowballStemmer
from collections import Counter
from difflib import SequenceMatcher


class ArticleSearch(object):
    """
    Searches a given list of articles for a search term with some parameters.

    The articles are read through their ``published``, ``source``,
    ``category`` and ``term_count`` attributes; nothing else is required of
    them by this class.
    """

    def __init__(self, article_list, sources=None):
        """
        Init
        @param article_list Object whose ``news`` attribute holds the articles
                            (generated by NewsExtractor().build_all())
        @param sources Currently unused; kept so existing callers keep working
        """
        self.stemmer = SnowballStemmer("english")
        self.article_list = article_list.news
        # Tokenizer shared by text_to_list() and text_to_vector().
        self.word = re.compile(r'\w+')

    # An empty search term returns all articles that pass the filters, so only
    # the other filters are used.
    # TODO: extend this to deal with multiple keywords (term2, cat2, source2
    # and place are accepted but not used yet).
    # 'cat' stands for category
    def search(self, term1="", term2="", cat1=None, cat2=None, date1=None, date2=None, place=None,
               source1=None, source2=None):
        """
        Search function that handles parameters
        @param term1 The string to be scored against each article's term_count;
                     None is treated as the empty string
        @param term2 Currently ignored
        @param cat1 If not None, only articles whose category equals it are kept
        @param cat2 Currently ignored
        @param date1 Earliest publication date; defaults to the Unix epoch
        @param date2 Latest publication date; defaults to the current time
        @param place Currently ignored
        @param source1 If not None, only articles whose source equals it are kept
        @param source2 Currently ignored
        @return List sorted by descending score. With a non-empty term each
                entry is [article, normalized_score, raw_score] and articles
                scoring 0 are dropped; with an empty term each entry is
                [article, 1].
        """
        if term1 is None:
            term1 = ""

        min_date = datetime.datetime.fromtimestamp(0) if date1 is None else date1
        max_date = datetime.datetime.now() if date2 is None else date2

        query_terms = self._stem_query(term1)

        best_score = 0
        scored_articles = []
        for article in self.article_list:
            if not self._passes_filters(article, min_date, max_date, source1, cat1):
                continue

            if term1 == "":
                # No term given: every article that passed the filters matches.
                scored_articles.append([article, 1])
                continue

            raw_score = self.similar(query_terms, article.term_count)
            if raw_score > 0:
                # Slot 1 is normalized below; slot 2 keeps the raw score.
                scored_articles.append([article, raw_score, raw_score])
            if raw_score > best_score:
                best_score = raw_score

        if best_score > 0:
            for entry in scored_articles:
                entry[1] /= best_score
        return sorted(scored_articles, key=operator.itemgetter(1), reverse=True)

    def _stem_query(self, text):
        """
        Lower-case and tokenize text, returning one stem per distinct token.
        A token is kept unstemmed when its stem is a single character or empty.
        """
        stemmed = []
        for token in self.text_to_vector(text.lower()):
            stem = self.stemmer.stem(token)
            stemmed.append(stem if len(stem) > 1 else token)
        return stemmed

    def _passes_filters(self, article, min_date, max_date, source, category):
        """
        Return True when the article satisfies the date, source and category filters.
        """
        published = article.published
        # Articles without a publication date ('' or None) are skipped.
        if not published:
            return False
        # tzinfo is dropped so the value compares against the naive bounds.
        if not (min_date <= published.replace(tzinfo=None) <= max_date):
            return False
        if source is not None and article.source != source:
            return False
        if category is not None and article.category != category:
            return False
        return True

    def text_to_list(self, text):
        """
        Split string with \\w+
        @param text String to be split
        @return List of the word tokens in order of appearance
        """
        # The pattern lives on the instance; there is no module-level `word`.
        return self.word.findall(text)

    def get_cosine(self, vec1, vec2):
        """
        Get the cosine similarity between two term-count vectors.
        @param vec1 Mapping of term -> count (e.g. a Counter)
        @param vec2 Mapping of term -> count (e.g. a Counter)
        @return Cosine similarity as a float; 0.0 when either vector has zero length
        """
        sum1 = sum(vec1[x] ** 2 for x in vec1)
        sum2 = sum(vec2[x] ** 2 for x in vec2)
        denominator = math.sqrt(sum1) * math.sqrt(sum2)
        if not denominator:
            return 0.0
        intersection = set(vec1) & set(vec2)
        numerator = sum(vec1[x] * vec2[x] for x in intersection)
        return float(numerator) / denominator

    def text_to_vector(self, text):
        """
        Split string with \\w+ and count the tokens
        @param text String to be split
        @return Counter mapping each token to its number of occurrences
        """
        return Counter(self.word.findall(text))  # cosine similarity method

    def similar(self, count1, count2):
        """
        Score how well the terms in count1 are covered by the counts in count2.
        Unique elements are scored higher than repeated elements: the first
        occurrence of a term is worth 1, while the total count only adds
        count/100 when the term occurs more than once.
        @param count1 Iterable of terms (search() passes a list of stems)
        @param count2 Term counts indexed by term; a Counter yields 0 for
                      missing terms
        @return The accumulated score as a float
        """
        score = 0.0
        for term in count1:
            occurrences = count2[term]
            if occurrences > 1:
                score += 1 + (float(occurrences) / 100)
            else:
                score += occurrences
        return score