1313from licensedcode .tokenize import query_lines
1414from samecode .halohash import BitAverageHaloHash
1515
16+ from matchcode_toolkit .stemming import get_stem_code
17+
1618# A collection of directory fingerprints that we want to avoid
1719IGNORED_DIRECTORY_FINGERPRINTS = [
1820 # This is both the directory content and directory structure fingerprint for
@@ -224,28 +226,81 @@ def get_file_fingerprint_hashes(
224226 content = f .read ()
225227
226228 return create_file_fingerprints (
227- content ,
229+ content = content ,
230+ ngram_length = ngram_length ,
231+ window_length = window_length ,
232+ include_ngrams = include_ngrams ,
233+ )
234+
235+
def get_stemmed_file_fingerprint_hashes(
    location,
    ngram_length=5,
    window_length=16,
    include_ngrams=False,
    **kwargs,
):
    """
    Return a mapping of stemmed code fingerprint hashes for the file at
    `location`.

    The `halo1` hash is the hex digest of the fingerprint of the file and is
    empty when the file is empty.

    - The file content is first stemmed, then broken into words (tokens)
    - Ngrams are then computed over the list of tokens

    Return an empty mapping if `location` is not a text file.
    """
    from commoncode import filetype
    from typecode.contenttype import get_type

    # Only regular text files can be stemmed and fingerprinted; skip
    # everything else (directories, binaries, etc.).
    content_type = get_type(location)
    if not filetype.is_file(location) or not content_type.is_text:
        return {}

    return create_file_fingerprints(
        stemmed_content=get_stem_code(location=location),
        ngram_length=ngram_length,
        window_length=window_length,
        include_ngrams=include_ngrams,
    )
232270
233271
234272def create_file_fingerprints (
235- content , ngram_length = 5 , window_length = SNIPPET_WINDOW_LENGTH , include_ngrams = False
273+ content = None ,
274+ stemmed_content = None ,
275+ ngram_length = 5 ,
276+ window_length = SNIPPET_WINDOW_LENGTH ,
277+ include_ngrams = False ,
236278):
237279 """
238- Return a mapping of halo1 and snippet hashes from content string
280+     Return a mapping of halo1 and snippet hashes from `content` or `stemmed_content`, not both.
239281 """
240282 from licensedcode .tokenize import ngrams
241283 from licensedcode .tokenize import select_ngrams
242284
285+ if content and stemmed_content :
286+ raise Exception (
287+ "create_file_fingerprints only accepts an input of `content` or `stemmed_content`, not both."
288+ )
289+
290+ if stemmed_content :
291+ halo1_key = "stemmed_halo1"
292+ snippets_key = "stemmed_snippets"
293+ else :
294+ halo1_key = "halo1"
295+ snippets_key = "snippets"
296+
243297 fingerprints = {
244- "halo1" : "" ,
245- "snippets" : [],
298+ halo1_key : "" ,
299+ snippets_key : [],
246300 }
247301
248302 # tokenize content into words
303+ content = content or stemmed_content
249304 words = list (tokenizer (content ))
250305
251306 # Create a file fingerprint from the number of elements in the content hash
@@ -259,7 +314,7 @@ def create_file_fingerprints(
259314 content_fingerprint = content_hash .hexdigest ().decode ("utf-8" )
260315 ngs_count_hex_str = "%08x" % ngs_count
261316 file_fingerprint = ngs_count_hex_str + content_fingerprint
262- fingerprints ["halo1" ] = file_fingerprint
317+ fingerprints [halo1_key ] = file_fingerprint
263318
264319 # Select windows from the content to compute snippet fingerprints
265320 windows = ngrams (words , window_length )
@@ -279,7 +334,7 @@ def create_file_fingerprints(
279334 s ["ngrams" ] = list (window )
280335 snippets .append (s )
281336 if snippets :
282- fingerprints ["snippets" ] = snippets
337+ fingerprints [snippets_key ] = snippets
283338
284339 return fingerprints
285340
0 commit comments