# html_parser.py -- from the sabrina0822/ddmal.github.io repository on GitHub (master branch).
'''Adapted by Emily Hopkins to DDMAL needs from the parser Evan Savage
wrote for the SIMSSA site.'''

from bs4 import BeautifulSoup
from urllib.parse import unquote
import re
import os
import markdown
import shutil

# Root of the site checkout; the per-category citation folders (`_media`,
# `_presentations`, ...) are created directly underneath it.
ddmal_root_folder = './'
# Folder holding the HTML bibliographies, one `DDMAL_<category>.html` each.
export_folder = 'zotero_export/'

# Accepted prompt answers. The first four are index-aligned with `full_list`;
# 'a' selects every category.
input_list = ['m', 'pr', 'pu', 'po', 'a']
full_list = ['media', 'presentations', 'publications', 'posters']


def resolve_parse_list(choice):
    """Translate a prompt answer into the list of categories to parse.

    Surrounding whitespace and letter case are ignored.

    Raises:
        ValueError: if *choice* is not one of the answers in `input_list`.
    """
    key = str(choice).strip().lower()
    if key not in input_list:
        raise ValueError(f'invalid category choice: {choice!r}')
    if key == 'a':
        # Return a copy so callers can never mutate the module-level list.
        return list(full_list)
    return [full_list[input_list.index(key)]]


def coins_field(parse_attr, key):
    """Return the raw (still percent-encoded) value of *key*, or None.

    *parse_attr* is an `&`-separated `name=value` string.  Field names are
    compared exactly, so asking for `rft.title` can never match a longer
    name that merely ends with it.  The first occurrence wins.
    """
    for pair in parse_attr.split('&'):
        name, sep, value = pair.partition('=')
        if sep and name == key:
            return value
    return None


def parse_citation_fields(parse_attr):
    """Extract `(year, author, title)` from a span's `title` string.

    Missing fields fall back to `'n.d.'`, `'no_author'` and `''`.  The title
    is the first non-empty value among `rft.title`, `rft.atitle` and
    `rft.btitle`, in that order.
    """
    raw_date = coins_field(parse_attr, 'rft.date')
    # Only the part before the first '-' is kept (e.g. '2019-05-01' -> '2019').
    year = (raw_date.split('-')[0] if raw_date else '') or 'n.d.'

    raw_author = coins_field(parse_attr, 'rft.aulast')
    author = (unquote(raw_author) if raw_author else '') or 'no_author'

    title = ''
    for key in ('rft.title', 'rft.atitle', 'rft.btitle'):
        raw_title = coins_field(parse_attr, key)
        if raw_title:
            title = unquote(raw_title)
            break
    return year, author, title


def build_file_name(author, title, year):
    """Build the markdown file name `<author>_<title>_<year>.md`.

    Slashes become spaces and spaces become underscores, so neither the
    author nor the title can introduce an extra path component.
    """
    safe_author = author.replace('/', ' ').replace(' ', '_')
    safe_title = title.replace('/', ' ').replace(' ', '_')
    return safe_author + '_' + safe_title + '_' + year + '.md'


def export_category(category):
    """Regenerate the `_<category>` folder from `DDMAL_<category>.html`.

    Every `div.csl-entry` in the export becomes one markdown file, written
    to `<root>/_<category>/<year>/`, containing YAML front matter with the
    year followed by the entry's inner HTML.  The category folder is wiped
    first, so entries removed from the export disappear from the site.
    """
    html_path = os.path.join(export_folder, f'DDMAL_{category}.html')
    citation_folder = os.path.join(ddmal_root_folder, f'_{category}')

    # Explicit encoding: the platform default would make results
    # locale-dependent for citations containing non-ASCII characters.
    with open(html_path, encoding='utf-8') as f:
        html_soup = BeautifulSoup(f, 'html.parser')

    # The folder does not exist on a first run; only remove it when present.
    if os.path.isdir(citation_folder):
        shutil.rmtree(citation_folder)
    os.makedirs(citation_folder)

    for html_tag in html_soup.find_all('div', {'class': 'csl-entry'}):
        # Metadata is read from the next <span> carrying a `title` attribute.
        # Requiring the attribute skips formatting spans inside the entry.
        span = html_tag.find_next('span', title=True)
        parse_attr = span['title'] if span is not None else ''

        year, author, title = parse_citation_fields(parse_attr)
        final_title = title.replace('/', ' ')
        file_name = build_file_name(author, final_title, year)
        entry_html = html_tag.decode_contents()

        year_folder = os.path.join(citation_folder, year)
        os.makedirs(year_folder, exist_ok=True)
        with open(os.path.join(year_folder, file_name), 'w', encoding='utf-8') as f:
            f.write(f'---\npresentation_year: {year}\nyear: {year}\n---\n\n{entry_html}')

        # Progress log, one group of lines per exported entry.
        print(entry_html, '\n')
        print(parse_attr, '\n')
        print('T', final_title, '\n\n')


def main():
    """Prompt for a category and export it; return the process exit status."""
    print('Media (m,M), presentations (pr, PR), publications (pu, PU), posters (po, PO), or all (a,A)?\n')
    try:
        parse_list = resolve_parse_list(input())
    except ValueError:
        print('\nTry again, the input was not valid.\n\n')
        return 1

    for category in parse_list:
        export_category(category)
    return 0


if __name__ == '__main__':
    raise SystemExit(main())