-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathproject_web_scraper.ruff
More file actions
337 lines (286 loc) · 10 KB
/
Copy pathproject_web_scraper.ruff
File metadata and controls
337 lines (286 loc) · 10 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
#!/usr/bin/env ruff
# Web Scraper - Extract Data from Websites
# Showcases: HTTP requests, regex patterns, data extraction, JSON export, error handling
# Command-line interface.
# --url is the only required option. --max-depth defaults to 1 and --timeout
# to 10.0; the remaining options declare no default.
parser := arg_parser()
parser.add_argument("--url", "-u", type="string", required=true, help="URL to scrape")
parser.add_argument("--pattern", "-p", type="string", help="Regex pattern to extract")
parser.add_argument("--selector", "-s", type="string", help="CSS-like selector (simple)")
parser.add_argument("--output", "-o", type="string", help="Output JSON file")
parser.add_argument("--follow-links", "-f", type="bool", help="Follow and scrape links")
parser.add_argument("--max-depth", "-d", type="int", default=1, help="Maximum link depth")
parser.add_argument("--timeout", "-t", type="float", default=10.0, help="Request timeout")
# Parsed values are read below as args._url, args._pattern, args._output,
# args._follow_links, args._max_depth and args._timeout.
args := parser.parse()
# Scraping results
# Shared accumulator for the whole run: scrape_page bumps the page counter and
# appends to the data/links/errors lists, display_summary reads it, and
# save_results serializes it with to_json.
results := {
"url": args._url,
"timestamp": timestamp(),
"pages_scraped": 0,
"data_extracted": [],
"links_found": [],
"errors": []
}
# Fetch page content
#
# Sends a GET request for `url`, passing `timeout` through to http_request and
# identifying itself with a fixed User-Agent header.
# Returns Ok(body) when the response status is exactly 200,
# Err("HTTP <status>") for any other status, and hands back the Err from
# http_request unchanged when the request itself fails.
func fetch_page(url, timeout) {
    print("🌐 Fetching: " + url)

    request_headers := {
        "User-Agent": "Ruff-Scraper/1.0"
    }
    request := {
        "method": "GET",
        "timeout": timeout,
        "headers": request_headers
    }

    outcome := http_request(url, request)
    match outcome {
        case Err(error): {
            return Err(error)
        }
        case Ok(response): {
            # Anything other than a plain 200 is reported as an error
            if response._status != 200 {
                return Err("HTTP " + to_string(response._status))
            }
            return Ok(response._body)
        }
    }
}
# Extract data using regex pattern
#
# Returns whatever regex_find_all yields for `pattern` over `content`.
# A null pattern means "nothing requested" and produces an empty list.
func extract_with_pattern(content, pattern) {
    if pattern != null {
        return regex_find_all(content, pattern)
    }
    return []
}
# Extract links from HTML
#
# Scans `content` for href attributes and returns the URLs they point to.
#   content  - page text to scan
#   base_url - URL of the page being scanned; its scheme and host are
#              prepended to links that start with "/"
# Returns a list of URL strings in the order regex_find_all reported them
# (duplicates are kept). Links starting with "#" or "javascript:" are skipped,
# as are links that start with neither "http" nor "/".
func extract_links(content, base_url) {
    # Simple link extraction (href="..." or href='...')
    link_pattern := "href=[\"']([^\"']+)[\"']"
    hits := regex_find_all(content, link_pattern)
    links := []
    for hit in hits {
        if contains(hit, "href") {
            # The attribute value sits between the quotes. Try double quotes
            # first; when there are none, fall back to single quotes so that
            # href='...' links accepted by the pattern are not silently lost.
            parts := split(hit, "\"")
            if length(parts) < 2 {
                parts = split(hit, "'")
            }
            if length(parts) >= 2 {
                url := parts[1]
                # Skip anchors and javascript
                if !starts_with(url, "#") and !starts_with(url, "javascript:") {
                    if starts_with(url, "http") {
                        # Already absolute
                        push(links, url)
                    } else if starts_with(url, "/") {
                        # Relative to domain
                        domain := extract_domain(base_url)
                        push(links, domain + url)
                    }
                }
            }
        }
    }
    return links
}
# Extract domain from URL
#
# Reduces a URL to "<scheme>://<host>" by cutting everything from the first
# "/" after the "://" separator. Input without "://" is returned unchanged.
func extract_domain(url) {
    if !contains(url, "://") {
        return url
    }

    pieces := split(url, "://")
    if length(pieces) < 2 {
        return url
    }

    scheme := pieces[0]
    host_and_path := split(pieces[1], "/")
    return scheme + "://" + host_and_path[0]
}
# Extract structured data (simple table parsing)
#
# Placeholder: the table/row/cell patterns are declared but never applied, so
# the function currently returns an empty list for every input.
func extract_tables(content) {
    # Patterns a regex-based extractor would use; real HTML parsing would
    # need a proper parser.
    cell_re := "<td[^>]*>(.*?)</td>"
    row_re := "<tr[^>]*>(.*?)</tr>"
    table_re := "<table[^>]*>(.*?)</table>"

    found := []
    return found
}
# Extract emails
#
# Collects every substring of `content` that matches the email pattern and
# returns each distinct address once.
func extract_emails(content) {
    address_re := "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}"
    candidates := regex_find_all(content, address_re)

    # A dict keyed by address acts as a set; its keys are the unique addresses
    seen := {}
    for address in candidates {
        seen[address] = true
    }
    return keys(seen)
}
# Extract phone numbers
#
# Runs each supported phone format over `content` and returns all matches,
# grouped by format in the order the formats are listed. Repeated numbers are
# not collapsed.
func extract_phones(content) {
    # Match common phone formats
    formats := [
        "\\(\\d{3}\\)\\s*\\d{3}-\\d{4}", # (555) 123-4567
        "\\d{3}-\\d{3}-\\d{4}", # 555-123-4567
        "\\d{3}\\.\\d{3}\\.\\d{4}", # 555.123.4567
        "\\+\\d{1,3}\\s*\\d{3}\\s*\\d{3}\\s*\\d{4}" # +1 555 123 4567
    ]

    collected := []
    for idx in range(0, length(formats)) {
        hits := regex_find_all(content, formats[idx])
        for hit in hits {
            push(collected, hit)
        }
    }
    return collected
}
# Scrape single page
#
# Fetches `url` (timeout taken from args._timeout), runs the extractors over
# the body and records the outcome in the shared `results` accumulator.
#   url                - page to fetch
#   pattern            - optional regex; the custom extraction is skipped when null
#   extract_links_flag - when set, links are collected for this page and merged
#                        into results._links_found without duplicates
# Returns Ok(page_data) on success. When the fetch fails, the error is appended
# to results._errors and handed back as Err(error).
func scrape_page(url, pattern, extract_links_flag) {
    fetched := fetch_page(url, args._timeout)
    match fetched {
        case Ok(content): {
            results._pages_scraped = results._pages_scraped + 1

            page_data := {
                "url": url,
                "content_length": length(content),
                "extracted": []
            }

            # Custom pattern, if one was supplied
            if pattern != null {
                hits := extract_with_pattern(content, pattern)
                page_data._pattern_matches = hits
                print(" ✓ Found " + to_string(length(hits)) + " pattern matches")
            }

            # Emails and phones are attached only when at least one was found
            found_emails := extract_emails(content)
            if length(found_emails) > 0 {
                page_data._emails = found_emails
                print(" 📧 Found " + to_string(length(found_emails)) + " email(s)")
            }

            found_phones := extract_phones(content)
            if length(found_phones) > 0 {
                page_data._phones = found_phones
                print(" 📞 Found " + to_string(length(found_phones)) + " phone(s)")
            }

            # Links: keep the per-page list as-is, add only unseen ones globally
            if extract_links_flag {
                page_links := extract_links(content, url)
                page_data._links = page_links
                for candidate in page_links {
                    if !contains_url(results._links_found, candidate) {
                        push(results._links_found, candidate)
                    }
                }
                print(" 🔗 Found " + to_string(length(page_links)) + " link(s)")
            }

            push(results._data_extracted, page_data)
            return Ok(page_data)
        }
        case Err(error): {
            push(results._errors, {"url": url, "error": error})
            return Err(error)
        }
    }
}
# Check if URL already in list
#
# Linear scan: returns true as soon as an element equal to `url` is found,
# false when the list is exhausted (including when it is empty).
func contains_url(list, url) {
    for idx in range(0, length(list)) {
        if list[idx] == url {
            return true
        }
    }
    return false
}
# Save results to JSON
#
# Serializes the shared `results` accumulator with to_json and writes it to
# `output_file`. Does nothing (returns null) when no output file was given.
# A write failure is reported on the console; it is not propagated.
func save_results(output_file) {
    if output_file == null {
        return null
    }

    print("")
    print("💾 Saving results to: " + output_file)

    write_outcome := write_file(output_file, to_json(results))
    match write_outcome {
        case Err(error): {
            print("❌ Failed to save results: " + error)
        }
        case Ok(_): {
            print("✅ Results saved successfully")
        }
    }
}
# Display summary
#
# Prints run-wide counters from the shared `results` accumulator: pages
# scraped, links found, the error count (only when non-zero) and per-category
# totals summed over every scraped page. Categories with a zero total are
# left out of the "Data extracted" section.
func display_summary() {
    rule := "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    print("")
    print(rule)
    print("📊 Scraping Summary")
    print(rule)
    print("")

    print("Pages scraped: " + to_string(results._pages_scraped))
    print("Links found: " + to_string(length(results._links_found)))
    error_count := length(results._errors)
    if error_count > 0 {
        print("Errors: " + to_string(error_count))
    }

    # Sum the optional per-page lists.
    # NOTE(review): entries are probed with has_key(page, "emails") but read as
    # page._emails - confirm both forms resolve to the same key.
    match_total := 0
    email_total := 0
    phone_total := 0
    for entry in results._data_extracted {
        if has_key(entry, "emails") {
            email_total = email_total + length(entry._emails)
        }
        if has_key(entry, "phones") {
            phone_total = phone_total + length(entry._phones)
        }
        if has_key(entry, "pattern_matches") {
            match_total = match_total + length(entry._pattern_matches)
        }
    }

    print("")
    print("Data extracted:")
    if match_total > 0 {
        print(" Pattern matches: " + to_string(match_total))
    }
    if email_total > 0 {
        print(" Email addresses: " + to_string(email_total))
    }
    if phone_total > 0 {
        print(" Phone numbers: " + to_string(phone_total))
    }
    print("")
}
# Main execution
print("")
print("🕷️ Web Scraper")
print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print("")

# Scrape the start page; its links are collected only when --follow-links is set
first_page := scrape_page(args._url, args._pattern, args._follow_links)
match first_page {
    case Err(error): {
        # The start page could not be scraped: report, summarize, exit non-zero
        print("❌ Scraping failed: " + error)
        display_summary()
        exit(1)
    }
    case Ok(_): {
        if args._follow_links and length(results._links_found) > 0 {
            print("")
            # args._max_depth is only reported here; the loop below visits the
            # start page's links once and does not collect links from them.
            print("🔗 Following links (max depth: " + to_string(args._max_depth) + ")...")

            # Hard cap on how many collected links are considered
            link_cap := 10
            visit_count := min(length(results._links_found), link_cap)
            for idx in range(0, visit_count) {
                target := results._links_found[idx]
                # Do not re-scrape the start page
                if target != args._url {
                    scrape_page(target, args._pattern, false)
                }
            }
        }

        display_summary()
        save_results(args._output)
        print("🎉 Scraping completed successfully!")
    }
}