hackathon-aveiro/WebScrapingFB.py at main · doemarques/hackathon-aveiro · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
######################### API STUFF
#
# Starts a run of an Apify actor against one Facebook page, then downloads the
# items of a hard-coded Apify dataset and prints three text fields per item.

import os

import requests

# API token. Taken from the APIFY_TOKEN environment variable when it is set.
# NOTE(review): the literal fallback only keeps the script working as before;
# a token committed to a repository should be revoked and the fallback removed.
API_TOKEN = (
    os.environ.get('APIFY_TOKEN')
    or 'apify_api_Hui1YjAM1cjLRzmYJ9F65IXyoulxUi4GDNGu'
)

# Actor ID for the Facebook posts scraper
ACTOR_ID = 'apify~facebook-posts-scraper'

# The Facebook page you want to scrape
START_URL = 'https://www.facebook.com/noticiasdeaveiro/?locale=pt_PT'

# Dataset whose items are printed below.
# NOTE(review): this ID is hard-coded and is NOT read from the run started by
# this script -- presumably it belongs to an earlier run; confirm.
DATASET_ID = 'HZ2Pb7WSKtphMNyKX'

# Seconds to wait on each HTTP call; without a timeout requests can block
# indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 30

# The token travels in a header rather than in the URL so that it does not
# end up in logged or cached URLs.
auth_headers = {'Authorization': f'Bearer {API_TOKEN}'}


def _print_field(post, key, label, placeholder):
    """Print ``label + post[key]``, or ``placeholder`` when that is impossible.

    ``placeholder`` is printed when ``key`` is absent, when its value is not a
    string, or when ``post`` is not a mapping (e.g. the API returned something
    other than a list of objects).
    """
    try:
        line = label + post[key]
    except (KeyError, TypeError):
        line = placeholder
    print(line)


# ---- 1. Trigger the scraper -------------------------------------------------

# API endpoint to start a run of the actor
url = f'https://api.apify.com/v2/acts/{ACTOR_ID}/runs'

# Headers
headers = {'Content-Type': 'application/json', **auth_headers}

# Body (scraper configuration)
payload = {
    "startUrls": [
        {"url": START_URL},
    ],
    "maxPosts": 10,            # Max posts to scrape
    "includeComments": False,  # Set to True if you want comments
}

# Make the request to start the scraper; fail loudly on an HTTP error instead
# of carrying on with an error body.
response = requests.post(
    url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
)
response.raise_for_status()

# Parsed response of the "start run" call. It is not used further down; it is
# kept under its own name so it can be inspected while debugging.
run_info = response.json()


# ---- 2. Fetch and print the dataset items -----------------------------------

# API endpoint to get dataset results
url = f'https://api.apify.com/v2/datasets/{DATASET_ID}/items'

# Make the request to fetch the results
response = requests.get(url, headers=auth_headers, timeout=REQUEST_TIMEOUT)
response.raise_for_status()

# Get the data in JSON format
data = response.json()

# Print the text fields of each post, one separator line per post
for post in data:
    print("-------------------------------------------------------------------------------------------")
    _print_field(post, 'text', 'text:', 'noText')
    _print_field(post, 'previewTitle', 'previewTitle: ', 'noPreviewTitle')
    _print_field(
        post, 'previewDescription', 'previewDescription: ', 'noPreviewDescription'
    )