forked from DDMAL/ddmal.github.io
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtml_parser.py
More file actions
88 lines (72 loc) · 3.08 KB
/
html_parser.py
File metadata and controls
88 lines (72 loc) · 3.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
'''Emily Hopkins adapted to DDMAL needs from the parser Evan Savage
wrote for the SIMSSA site'''
from bs4 import BeautifulSoup
from urllib.parse import unquote
import re
import os
import markdown
import shutil
# Menu answers accepted at the prompt; input_list[i] selects full_list[i],
# and the extra trailing 'a' selects every category.
input_list = ['m', 'pr', 'pu', 'po', 'a']
full_list = ['media', 'presentations', 'publications', 'posters']

# The script resolves both folders relative to the current working directory.
ddmal_root_folder = './'
export_folder = 'zotero_export/'


def select_categories(choice):
    """Translate the user's menu answer into the list of categories to parse.

    The answer is matched case-insensitively and surrounding whitespace is
    ignored. Returns a new list of category names, or an empty list when the
    answer is not one of the accepted options.
    """
    choice = choice.strip().lower()
    if choice not in input_list:
        return []
    if choice == 'a':
        return list(full_list)
    return [full_list[input_list.index(choice)]]


def parse_coins(parse_attr):
    """Extract ``(year, author, title)`` from a span's ``title`` string.

    ``parse_attr`` is read as ``key=value`` pairs joined by ``&``. When a key
    occurs more than once, its first occurrence is used.

    * year   -- the part of ``rft.date`` before the first ``-``; ``'n.d.'``
                when the key is missing or empty.
    * author -- URL-decoded ``rft.aulast``; ``'no_author'`` when missing.
    * title  -- URL-decoded value of the first key present among
                ``rft.title``, ``rft.atitle``, ``rft.btitle`` (in that order)
                with ``/`` replaced by a space; ``''`` when none is present.
    """
    fields = {}
    for pair in parse_attr.split('&'):
        key, sep, value = pair.partition('=')
        # Tolerate separators that were left HTML-escaped as '&amp;'.
        if key.startswith('amp;'):
            key = key[len('amp;'):]
        if sep and key not in fields:
            fields[key] = value

    year = fields.get('rft.date', '').split('-')[0] or 'n.d.'

    author = 'no_author'
    if 'rft.aulast' in fields:
        author = unquote(fields['rft.aulast'])

    title = ''
    for key in ('rft.title', 'rft.atitle', 'rft.btitle'):
        if key in fields:
            title = unquote(fields[key])
            break

    # '/' would be read as a path separator once the title is in a file name.
    return year, author, title.replace('/', ' ')


def build_file_name(author, title, year):
    """Return the markdown file name ``<author>_<title>_<year>.md``.

    Spaces and ``/`` in the title, and ``/`` in the author, become ``_`` so
    the result is always a single path component.
    """
    safe_author = author.replace('/', '_')
    safe_title = title.replace('/', ' ').replace(' ', '_')
    return safe_author + '_' + safe_title + '_' + year + '.md'


def export_category(category):
    """Rebuild the ``_<category>`` folder from ``DDMAL_<category>.html``.

    Reads the HTML file from ``export_folder``, deletes any existing
    ``_<category>`` folder under ``ddmal_root_folder``, then writes one
    markdown file per ``div.csl-entry`` into a sub-folder named after the
    entry's year. Entries whose metadata span is missing are reported and
    skipped. Entries that resolve to the same file name overwrite each other.
    """
    html_path = os.path.join(export_folder, f'DDMAL_{category}.html')
    citation_folder = os.path.join(ddmal_root_folder, f'_{category}')

    # Parse before deleting anything, so a missing export file leaves the
    # existing output folder intact.
    with open(html_path, encoding='utf-8') as f:
        html_soup = BeautifulSoup(f, 'html.parser')

    # The folder does not exist on a first run; only remove it when present.
    if os.path.isdir(citation_folder):
        shutil.rmtree(citation_folder)
    os.makedirs(citation_folder)

    for html_tag in html_soup.find_all('div', {'class': 'csl-entry'}):
        # The metadata is read from the 'title' attribute of the next <span>
        # in document order after the entry.
        metadata_span = html_tag.find_next('span')
        parse_attr = None
        if metadata_span is not None:
            parse_attr = metadata_span.get('title')
        if parse_attr is None:
            print('Skipping entry without a metadata span:',
                  html_tag.get_text(), '\n')
            continue

        year, author, final_title = parse_coins(parse_attr)
        file_name = build_file_name(author, final_title, year)

        year_folder = os.path.join(citation_folder, year)
        os.makedirs(year_folder, exist_ok=True)

        contents = html_tag.decode_contents()
        with open(os.path.join(year_folder, file_name), 'w',
                  encoding='utf-8') as f:
            f.write(f'---\npresentation_year: {year}\nyear: {year}\n---\n\n{contents}')

        print(contents, '\n')
        print(parse_attr, '\n')
        print('T', final_title, '\n\n')


def main():
    """Ask which category to rebuild and export each selected one."""
    print('Media (m,M), presentations (pr, PR), publications (pu, PU), posters (po, PO), or all (a,A)?\n')
    parse_list = select_categories(input())
    if not parse_list:
        print('\nTry again, the input was not valid.\n\n')
        return
    for category in parse_list:
        export_category(category)


if __name__ == '__main__':
    main()