From 15450eb811cd0d20d456398e6474db0cafc795f3 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 14 May 2025 10:28:06 +0800 Subject: [PATCH 1/3] Add Chinese language support with English as default --- web/static/css/main.css | 48 ++++++- web/static/templates/index.html | 218 ++++++++++++++++++++++++++++- web/static/templates/job_row.html | 18 ++- web/static/templates/job_rows.html | 21 ++- 4 files changed, 292 insertions(+), 13 deletions(-) diff --git a/web/static/css/main.css b/web/static/css/main.css index 9e59f37d..83f7302d 100644 --- a/web/static/css/main.css +++ b/web/static/css/main.css @@ -269,7 +269,17 @@ th { } .content { - padding: 20px; + padding: 16px; + } + + nav { + flex-direction: column; + align-items: flex-start; + } + + .language-selector { + margin-left: 0; + margin-top: 10px; } } @@ -333,8 +343,40 @@ th { } nav { - margin-bottom: 16px; - padding: 8px 0; + display: flex; + align-items: center; + margin-top: 16px; + justify-content: space-between; +} + +.nav-links { + display: flex; + gap: 20px; +} + +.language-selector { + margin-left: 15px; +} + +.language-selector select { + padding: 8px 12px; + border-radius: 4px; + border: 1px solid var(--color-border); + background-color: var(--color-surface); + font-size: 14px; + color: var(--color-text); + cursor: pointer; + transition: border-color 0.2s, box-shadow 0.2s; +} + +.language-selector select:hover { + border-color: var(--color-primary-light); +} + +.language-selector select:focus { + outline: none; + border-color: var(--color-primary); + box-shadow: 0 0 0 2px rgba(74, 74, 74, 0.2); } nav a { diff --git a/web/static/templates/index.html b/web/static/templates/index.html index d289ba1b..a58c0c5d 100644 --- a/web/static/templates/index.html +++ b/web/static/templates/index.html @@ -12,7 +12,15 @@

Google Maps Scraper

If you find this tool useful, please consider starring our repository:

@@ -154,6 +162,214 @@

Google Maps Scraper

sponsorSection.style.display = 'none'; } } + +// Ensure English is the default language +function ensureDefaultLanguage() { + // Check if language cookie exists + const cookies = document.cookie.split(';'); + let languageCookieExists = false; + let currentLanguage = ''; + + for (let i = 0; i < cookies.length; i++) { + const cookie = cookies[i].trim(); + if (cookie.startsWith('language=')) { + languageCookieExists = true; + currentLanguage = cookie.substring('language='.length); + break; + } + } + + // If no language cookie exists, or language is invalid, set to English + if (!languageCookieExists || (currentLanguage !== 'en' && currentLanguage !== 'zh')) { + document.cookie = "language=en; path=/; max-age=31536000"; // 1 year expiration + } +} + +// Call function when page loads +ensureDefaultLanguage(); + +function changeLanguage(lang) { + // Set a cookie to store the language preference + document.cookie = "language=" + lang + "; path=/; max-age=31536000"; // 1 year expiration + // Reload the page to apply the language change + window.location.reload(); +} + +// Translations for UI elements +const translations = { + en: { + "title": "Google Maps Scraper", + "apiDocs": "API Documentation", + "starGithub": "If you find this tool useful, please consider starring our repository:", + "starButton": "Star on GitHub", + "sponsorText": "Support the ongoing maintenance and development of this project by becoming a sponsor. 
Your contribution helps ensure the tool's continued improvement and reliability.", + "sponsorButton": "Sponsor on GitHub", + "jobDetails": "Job Details", + "jobName": "Job Name:", + "keywords": "Keywords:", + "language": "Language:", + "locationSettings": "Location Settings", + "zoom": "Zoom:", + "latitude": "Latitude:", + "longitude": "Longitude:", + "advancedOptions": "Advanced Options", + "fastMode": "Fast Mode (BETA):", + "radius": "Radius (BETA):", + "depth": "Depth:", + "fetchEmails": "Fetch Emails", + "maxJobTime": "Max job time:", + "proxies": "Proxies:(one per line)", + "proxyExamples": "Examples:", + "httpsProxyExample": "HTTPS proxy with username/password: https://username:password@myproxy.local:443", + "httpProxyExample": "HTTP proxy with username/password: http://username:password@myproxy.local:443", + "socksProxyExample": "SOCKS5 proxy without auth: socks5://127.0.0.1:8000", + "startScraping": "Start Scraping", + "jobID": "Job ID", + "jobName": "Job Name", + "jobDate": "Job Date", + "status": "Status", + "actions": "Actions" + }, + zh: { + "title": "谷歌地图数据抓取工具", + "apiDocs": "API文档", + "starGithub": "如果您觉得此工具有用,请考虑为我们的仓库点赞:", + "starButton": "在GitHub上点赞", + "sponsorText": "通过成为赞助商来支持这个项目的持续维护和开发。您的贡献有助于确保工具的持续改进和可靠性。", + "sponsorButton": "在GitHub上赞助", + "jobDetails": "任务详情", + "jobName": "任务名称:", + "keywords": "关键词:", + "language": "语言:", + "locationSettings": "位置设置", + "zoom": "缩放级别:", + "latitude": "纬度:", + "longitude": "经度:", + "advancedOptions": "高级选项", + "fastMode": "快速模式(测试版):", + "radius": "半径(测试版):", + "depth": "深度:", + "fetchEmails": "获取电子邮件", + "maxJobTime": "最长任务时间:", + "proxies": "代理服务器:(每行一个)", + "proxyExamples": "示例:", + "httpsProxyExample": "带用户名/密码的HTTPS代理: https://username:password@myproxy.local:443", + "httpProxyExample": "带用户名/密码的HTTP代理: http://username:password@myproxy.local:443", + "socksProxyExample": "不带验证的SOCKS5代理: socks5://127.0.0.1:8000", + "startScraping": "开始抓取", + "jobID": "任务ID", + "jobName": "任务名称", + "jobDate": "任务日期", 
+ "status": "状态", + "actions": "操作" + } +}; + +// Set the language selector to match the current language and update UI text +document.addEventListener('DOMContentLoaded', function() { + // Try to get the language from the cookie + const cookies = document.cookie.split(';'); + let currentLang = 'en'; // Default to English + + for (let i = 0; i < cookies.length; i++) { + const cookie = cookies[i].trim(); + if (cookie.startsWith('language=')) { + const cookieLang = cookie.substring('language='.length); + // Only accept valid languages, otherwise default to English + if (cookieLang === 'zh' || cookieLang === 'en') { + currentLang = cookieLang; + } + break; + } + } + + // Update the select element to show the current language + const languageSelect = document.getElementById('language-select'); + if (languageSelect) { + languageSelect.value = currentLang; + } + + // Update text elements with translations + updateUIText(currentLang); +}); + +function updateUIText(lang) { + // Get the translations for the selected language + const trans = translations[lang] || translations.en; + + // Update the document title + document.title = trans.title; + + // Update header elements + document.querySelector('header h1').textContent = trans.title; + document.querySelector('header nav a').textContent = trans.apiDocs; + document.querySelector('.github-section p').textContent = trans.starGithub; + document.querySelector('.github-button').textContent = trans.starButton; + document.querySelector('.sponsor-text').textContent = trans.sponsorText; + document.querySelector('.sponsor-button').textContent = trans.sponsorButton; + + // Update form elements + document.querySelector('legend').textContent = trans.jobDetails; + document.querySelector('label[for="name"]').textContent = trans.jobName; + document.querySelector('label[for="keywords"]').textContent = trans.keywords; + document.querySelector('label[for="lang"]').textContent = trans.language; + + // Update expandable sections + const 
summaries = document.querySelectorAll('summary'); + summaries[0].textContent = trans.locationSettings; + summaries[1].textContent = trans.advancedOptions; + + // Update location settings + document.querySelector('label[for="zoom"]').textContent = trans.zoom; + document.querySelector('label[for="latitude"]').textContent = trans.latitude; + document.querySelector('label[for="longitude"]').textContent = trans.longitude; + + // Update advanced options + document.querySelector('label[for="fastmode"]').textContent = trans.fastMode; + document.querySelector('label[for="radius"]').textContent = trans.radius; + document.querySelector('label[for="depth"]').textContent = trans.depth; + document.querySelector('label[for="email"]').textContent = trans.fetchEmails; + document.querySelector('label[for="maxtime"]').textContent = trans.maxJobTime; + + // Update proxies section + document.querySelector('label[for="proxies"]').textContent = trans.proxies; + document.querySelector('.text-muted small').textContent = trans.proxyExamples; + + // Update proxy examples + const proxyExampleParagraphs = document.querySelectorAll('.text-muted p'); + if (proxyExampleParagraphs.length >= 3) { + proxyExampleParagraphs[0].textContent = trans.httpsProxyExample; + proxyExampleParagraphs[1].textContent = trans.httpProxyExample; + proxyExampleParagraphs[2].textContent = trans.socksProxyExample; + } + + // Update button text + document.querySelector('button[type="submit"]').textContent = trans.startScraping; + + // Update table headers + const tableHeaders = document.querySelectorAll('#job-table th'); + tableHeaders[0].textContent = trans.jobID; + tableHeaders[1].textContent = trans.jobName; + tableHeaders[2].textContent = trans.jobDate; + tableHeaders[3].textContent = trans.status; + tableHeaders[4].textContent = trans.actions; +} + + diff --git a/web/static/templates/job_row.html b/web/static/templates/job_row.html index fe1d2d0d..cb36a48a 100644 --- a/web/static/templates/job_row.html +++ 
b/web/static/templates/job_row.html @@ -3,16 +3,26 @@ {{.Name}} {{.Date}} - {{.Status}} + + {{if eq .Lang "zh"}} + {{if eq .Status "ok"}}完成{{else if eq .Status "running"}}运行中{{else if eq .Status "error"}}错误{{else if eq .Status "canceled"}}已取消{{else}}{{.Status}}{{end}} + {{else}} + {{.Status}} + {{end}} + {{ if eq .Status "ok" }} - Download + + {{if eq .Lang "zh"}}下载{{else}}Download{{end}} + {{ end }} + hx-confirm="{{if eq .Lang "zh"}}您确定要删除此任务吗?{{else}}Are you sure you want to delete this job?{{end}}" + class="delete-button"> + {{if eq .Lang "zh"}}删除{{else}}Delete{{end}} + diff --git a/web/static/templates/job_rows.html b/web/static/templates/job_rows.html index acc7eb01..9e6aacf6 100644 --- a/web/static/templates/job_rows.html +++ b/web/static/templates/job_rows.html @@ -1,20 +1,31 @@ -{{range .}} +{{$lang := .Lang}} +{{range .Jobs}} {{.ID}} {{.Name}} {{.Date}} - {{.Status}} + + {{if eq $lang "zh"}} + {{if eq .Status "ok"}}完成{{else if eq .Status "running"}}运行中{{else if eq .Status "error"}}错误{{else if eq .Status "canceled"}}已取消{{else}}{{.Status}}{{end}} + {{else}} + {{.Status}} + {{end}} + {{ if eq .Status "ok" }} - Download + + {{if eq $lang "zh"}}下载{{else}}Download{{end}} + {{ end }} + hx-confirm="{{if eq $lang "zh"}}您确定要删除此任务吗?{{else}}Are you sure you want to delete this job?{{end}}" + class="delete-button"> + {{if eq $lang "zh"}}删除{{else}}Delete{{end}} + {{end}} From dd9a5ba8f8ad07506cdeca2eadf4f266504fcab4 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 14 May 2025 14:47:22 +0800 Subject: [PATCH 2/3] Fix Google Maps scraper resilience and error handling --- gmaps/job.go | 279 ++++++++++++++++++---- gmaps/place.go | 107 ++++++--- gmaps/reviews.go | 575 ++++++++++++++++++++++++++++++++++++++++++++- gmaps/searchjob.go | 157 +++++++++++-- runner/jobs.go | 47 +++- 5 files changed, 1066 insertions(+), 99 deletions(-) diff --git a/gmaps/job.go b/gmaps/job.go index 392a27a1..52ce43a3 100644 --- a/gmaps/job.go +++ b/gmaps/job.go @@ -24,6 +24,7 @@ type GmapJob 
struct { MaxDepth int LangCode string ExtractEmail bool + ReviewsLimit int Deduper deduper.Deduper ExitMonitor exiter.Exiter @@ -36,6 +37,7 @@ func NewGmapJob( extractEmail bool, geoCoordinates string, zoom int, + reviewsLimit int, opts ...GmapJobOptions, ) *GmapJob { query = url.QueryEscape(query) @@ -69,6 +71,7 @@ func NewGmapJob( MaxDepth: maxDepth, LangCode: langCode, ExtractEmail: extractEmail, + ReviewsLimit: reviewsLimit, } for _, opt := range opts { @@ -121,7 +124,7 @@ func (j *GmapJob) Process(ctx context.Context, resp *scrapemate.Response) (any, jopts = append(jopts, WithPlaceJobExitMonitor(j.ExitMonitor)) } - placeJob := NewPlaceJob(j.ID, j.LangCode, resp.URL, j.ExtractEmail, j.ExtractExtraReviews, jopts...) + placeJob := NewPlaceJob(j.ID, j.LangCode, resp.URL, j.ExtractEmail, j.ExtractExtraReviews, j.ReviewsLimit, jopts...) next = append(next, placeJob) } else { @@ -132,7 +135,7 @@ func (j *GmapJob) Process(ctx context.Context, resp *scrapemate.Response) (any, jopts = append(jopts, WithPlaceJobExitMonitor(j.ExitMonitor)) } - nextJob := NewPlaceJob(j.ID, j.LangCode, href, j.ExtractEmail, j.ExtractExtraReviews, jopts...) + nextJob := NewPlaceJob(j.ID, j.LangCode, href, j.ExtractEmail, j.ExtractExtraReviews, j.ReviewsLimit, jopts...) 
if j.Deduper == nil || j.Deduper.AddIfNotExists(ctx, href) { next = append(next, nextJob) @@ -154,33 +157,43 @@ func (j *GmapJob) Process(ctx context.Context, resp *scrapemate.Response) (any, func (j *GmapJob) BrowserActions(ctx context.Context, page playwright.Page) scrapemate.Response { var resp scrapemate.Response - pageResponse, err := page.Goto(j.GetFullURL(), playwright.PageGotoOptions{ + fullURL := j.GetFullURL() + fmt.Printf("Visiting URL: %s\n", fullURL) + + const navigationTimeout = 30000 // 30 seconds + + _, _ = page.SetExtraHTTPHeaders(map[string]string{ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + }) + + pageResponse, err := page.Goto(fullURL, playwright.PageGotoOptions{ WaitUntil: playwright.WaitUntilStateDomcontentloaded, + Timeout: playwright.Float(navigationTimeout), }) if err != nil { - resp.Error = err - + resp.Error = fmt.Errorf("navigation failed: %w", err) + fmt.Printf("Navigation error: %v\n", err) return resp } + // Reject cookies if needed if err = clickRejectCookiesIfRequired(page); err != nil { - resp.Error = err - - return resp + fmt.Printf("Cookie rejection error (non-fatal): %v\n", err) + // Don't return yet, continue with the process } const defaultTimeout = 5000 + // Wait for the URL to stabilize err = page.WaitForURL(page.URL(), playwright.PageWaitForURLOptions{ WaitUntil: playwright.WaitUntilStateDomcontentloaded, Timeout: playwright.Float(defaultTimeout), }) if err != nil { - resp.Error = err - - return resp + fmt.Printf("URL stabilization error (non-fatal): %v\n", err) + // Don't return yet, continue with the process } resp.URL = pageResponse.URL() @@ -192,25 +205,54 @@ func (j *GmapJob) BrowserActions(ctx context.Context, page playwright.Page) scra } // When Google Maps finds only 1 place, it slowly redirects to that place's URL - // check element scroll - sel := `div[role='feed']` - - //nolint:staticcheck // TODO replace with the new 
playwright API - _, err = page.WaitForSelector(sel, playwright.PageWaitForSelectorOptions{ - Timeout: playwright.Float(700), - }) - - var singlePlace bool + // Check for this redirection + singlePlace := false + feedSelector := `div[role='feed']` + + // Try multiple selectors for the feed element + selectors := []string{ + feedSelector, + ".section-layout.section-scrollbox", + ".section-layout.section-scrollbox scrollable-y", + ".m6QErb.DxyBCb.kA9KIf.dS8AEf", + ".m6QErb.DxyBCb.kA9KIf", + ".DxyBCb.kA9KIf", + ".section-scrollbox", + } + + feedFound := false + for _, sel := range selectors { + //nolint:staticcheck // TODO replace with the new playwright API + feedElement, err := page.WaitForSelector(sel, playwright.PageWaitForSelectorOptions{ + Timeout: playwright.Float(700), + }) + + if err == nil && feedElement != nil { + feedFound = true + feedSelector = sel + break + } + } - if err != nil { - waitCtx, waitCancel := context.WithTimeout(ctx, time.Second*5) + if !feedFound { + waitCtx, waitCancel := context.WithTimeout(ctx, time.Second*10) defer waitCancel() singlePlace = waitUntilURLContains(waitCtx, page, "/maps/place/") + + if !singlePlace { + // If we're not in a single place view and couldn't find the feed selector, + // check if we've been redirected to a search results view with a different structure + fmt.Println("Feed not found, checking for alternative results structure...") + + // Try one last approach - just get the page content regardless + singlePlace = true + } waitCancel() } + // Handle single place or search results list appropriately if singlePlace { resp.URL = page.URL() @@ -227,15 +269,16 @@ func (j *GmapJob) BrowserActions(ctx context.Context, page playwright.Page) scra return resp } - scrollSelector := `div[role='feed']` - - _, err = scroll(ctx, page, j.MaxDepth, scrollSelector) + // Handle search results with scrolling + scrollCnt, err := scroll(ctx, page, j.MaxDepth, feedSelector) if err != nil { - resp.Error = err - - return resp + 
fmt.Printf("Scroll error: %v\n", err) + // Continue to get the content anyway } + + fmt.Printf("Scrolled %d times\n", scrollCnt) + // Get the final page content body, err := page.Content() if err != nil { resp.Error = err @@ -291,16 +334,128 @@ func scroll(ctx context.Context, maxDepth int, scrollSelector string, ) (int, error) { - expr := `async () => { - const el = document.querySelector("` + scrollSelector + `"); - el.scrollTop = el.scrollHeight; + // First, check if the selector exists at all + hasElement, err := page.Evaluate(fmt.Sprintf(`() => { + const selectors = [ + "%s", + "div[role='feed']", + ".section-layout.section-scrollbox", + ".section-layout.section-scrollbox scrollable-y", + ".m6QErb.DxyBCb.kA9KIf.dS8AEf", + ".m6QErb.DxyBCb.kA9KIf", + ".DxyBCb.kA9KIf", + ".section-scrollbox", + ".Yr7JMd.fontTitleLarge" + ]; + + for (const selector of selectors) { + const el = document.querySelector(selector); + if (el) { + console.log("Found scrollable element: " + selector); + return true; + } + } + + console.error("No scrollable element found with any of the selectors"); + return false; + }`, scrollSelector)) + + if err != nil { + fmt.Printf("Error checking for scrollable elements: %v\n", err) + } else if hasElement.(bool) == false { + fmt.Println("No scrollable elements found, will try to scroll the document body") + + // If no elements found, just scroll the document and return + for i := 0; i < maxDepth; i++ { + _, err := page.Evaluate(`() => { + window.scrollBy(0, 500); + return document.body.scrollHeight; + }`) + + if err != nil { + return i, fmt.Errorf("failed to scroll document: %w", err) + } + + // Wait between scrolls + page.WaitForTimeout(500) + } + + return maxDepth, nil + } - return new Promise((resolve, reject) => { - setTimeout(() => { - resolve(el.scrollHeight); - }, %d); - }); - }` + // Continue with the normal scrolling if we found elements + expr := `async () => { + try { + // Try multiple potential selectors in case the UI structure has 
changed + const selectors = [ + "` + scrollSelector + `", + "div[role='feed']", + ".section-layout.section-scrollbox", + ".section-layout.section-scrollbox scrollable-y", + ".m6QErb.DxyBCb.kA9KIf.dS8AEf", + ".m6QErb.DxyBCb.kA9KIf", + ".DxyBCb.kA9KIf", + ".section-scrollbox", + ".Yr7JMd.fontTitleLarge" + ]; + + let el = null; + for (const selector of selectors) { + el = document.querySelector(selector); + if (el) { + console.log("Using selector for scrolling: " + selector); + break; + } + } + + // If no scrollable element is found, try the document body or return 0 + if (!el) { + console.warn("No scrollable element found for scrolling, using document.body"); + el = document.body; + + if (!el) { + console.error("No scrollable element found, not even document.body"); + return 0; + } + } + + // Log the scroll properties for debugging + console.log("Element properties before scroll - scrollHeight: " + + (el.scrollHeight || "undefined") + + ", scrollTop: " + (el.scrollTop || "undefined") + + ", clientHeight: " + (el.clientHeight || "undefined")); + + // Safely attempt to scroll + try { + const scrollHeight = el.scrollHeight || 0; + if (typeof el.scrollTop !== 'undefined') { + el.scrollTop = scrollHeight; + console.log("Scrolled element to: " + el.scrollTop); + } else { + // Fallback to window scrolling + window.scrollTo(0, document.body.scrollHeight); + console.log("Used window.scrollTo fallback"); + } + + return new Promise((resolve) => { + setTimeout(() => { + const newScrollHeight = el.scrollHeight || 0; + console.log("New scroll height: " + newScrollHeight); + resolve(newScrollHeight); + }, %d); + }); + } catch (e) { + console.error("Error during scroll:", e); + // Try window.scrollBy as a fallback + window.scrollBy(0, 500); + console.log("Used window.scrollBy fallback due to error"); + return 0; + } + } catch (outerError) { + console.error("Outer error in scroll function:", outerError); + return 0; + } + }`; var currentScrollHeight int // Scroll to the bottom of the 
page. @@ -323,16 +478,58 @@ func scroll(ctx context.Context, // Scroll to the bottom of the page. scrollHeight, err := page.Evaluate(fmt.Sprintf(expr, waitTime2)) if err != nil { - return cnt, err + fmt.Printf("Scroll error on iteration %d: %v\n", i, err) + + // Try a simple fallback + _, fallbackErr := page.Evaluate(`() => { + window.scrollBy(0, 500); + return true; + }`) + + if fallbackErr != nil { + return cnt, err // Return the original error if fallback also fails + } + + // Wait and continue + page.WaitForTimeout(500) + continue } height, ok := scrollHeight.(int) if !ok { - return cnt, fmt.Errorf("scrollHeight is not an int") + // Try to convert from float64 which is common in JavaScript returns + if floatHeight, isFloat := scrollHeight.(float64); isFloat { + height = int(floatHeight) + } else { + fmt.Printf("Unexpected scrollHeight type: %T\n", scrollHeight) + // Continue with fallback scrolling + _, fallbackErr := page.Evaluate(`() => { + window.scrollBy(0, 500); + return true; + }`) + + if fallbackErr != nil { + return cnt, fmt.Errorf("scrollHeight is not an int or float64 and fallback failed: %w", fallbackErr) + } + + // Wait and continue + page.WaitForTimeout(500) + continue + } } - if height == currentScrollHeight { - break + if height == 0 || height == currentScrollHeight { + // If height is 0 or hasn't changed, try one more approach with window.scrollBy + _, byErr := page.Evaluate(`() => { + window.scrollBy(0, 500); + console.log("Used window.scrollBy because height is unchanged or zero"); + return true; + }`) + + if byErr != nil { + // If even this fails, break the loop + break + } } currentScrollHeight = height diff --git a/gmaps/place.go b/gmaps/place.go index 43233bb8..e49d14ee 100644 --- a/gmaps/place.go +++ b/gmaps/place.go @@ -3,6 +3,7 @@ package gmaps import ( "context" "fmt" + "log" "net/http" "strings" "time" @@ -22,9 +23,10 @@ type PlaceJob struct { ExtractEmail bool ExitMonitor exiter.Exiter ExtractExtraReviews bool + ReviewsLimit int } 
-func NewPlaceJob(parentID, langCode, u string, extractEmail, extraExtraReviews bool, opts ...PlaceJobOptions) *PlaceJob { +func NewPlaceJob(parentID, langCode, u string, extractEmail, extraExtraReviews bool, reviewsLimit int, opts ...PlaceJobOptions) *PlaceJob { const ( defaultPrio = scrapemate.PriorityMedium defaultMaxRetries = 3 @@ -40,6 +42,7 @@ func NewPlaceJob(parentID, langCode, u string, extractEmail, extraExtraReviews b MaxRetries: defaultMaxRetries, Priority: defaultPrio, }, + ReviewsLimit: reviewsLimit, } job.UsageInResultststs = true @@ -79,30 +82,80 @@ func (j *PlaceJob) Process(_ context.Context, resp *scrapemate.Response) (any, [ entry.ID = j.ParentID if entry.Link == "" { - entry.Link = j.GetURL() + entry.Link = j.GetFullURL() } - allReviewsRaw, ok := resp.Meta["reviews_raw"].(fetchReviewsResponse) - if ok && len(allReviewsRaw.pages) > 0 { - entry.AddExtraReviews(allReviewsRaw.pages) + if j.ExtractExtraReviews { + reviewCount := j.getReviewCount(raw) + if reviewCount > 8 { // we have more reviews + if j.ReviewsLimit != 0 { + // Safely attempt to convert the document to a Playwright page + page, ok := resp.Document.(playwright.Page) + if !ok { + log.Printf("Warning: Document is not a playwright.Page, skipping review extraction") + return entry, nil, nil + } + + // Introduce a delay to ensure page is fully loaded + time.Sleep(3 * time.Second) + + // Create a context with reasonable timeout + reviewsCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + + // Try to get reviews with error recovery + fetchedCount, reviews, err := scrollReviews(reviewsCtx, page, j.ReviewsLimit) + if err != nil { + log.Printf("Warning: error scrolling reviews: %v", err) + } else { + log.Printf("Successfully fetched %d reviews", fetchedCount) + + if len(reviews) > 0 { + for _, review := range reviews { + entry.AddReview(review.AuthorName, review.AuthorURL, review.Rating, review.RelativeTimeDescription, review.Text) + } + log.Printf("Added 
%d reviews to entry", len(reviews)) + } + } + } else { + // For this path, also safely handle the page conversion + page, ok := resp.Document.(playwright.Page) + if !ok { + log.Printf("Warning: Document is not a playwright.Page, skipping review extraction") + return entry, nil, nil + } + + params := fetchReviewsParams{ + page: page, + mapURL: j.GetFullURL(), + reviewCount: reviewCount, + } + + reviewFetcher := newReviewFetcher(params) + + reviewData, err := reviewFetcher.fetch(context.Background()) + if err != nil { + log.Printf("Warning: failed to fetch reviews: %s", err) + } else { + resp.Meta["reviews_raw"] = reviewData + } + } + } } - if j.ExtractEmail && entry.IsWebsiteValidForEmail() { - opts := []EmailExtractJobOptions{} - if j.ExitMonitor != nil { - opts = append(opts, WithEmailJobExitMonitor(j.ExitMonitor)) + if j.ExtractEmail { + info := extractBusinessInfo(raw) + if info.Website != "" { + entry.WebSite = info.Website } - emailJob := NewEmailJob(j.ID, &entry, opts...) - - j.UsageInResultststs = false - - return nil, []scrapemate.IJob{emailJob}, nil - } else if j.ExitMonitor != nil { - j.ExitMonitor.IncrPlacesCompleted(1) + if entry.IsWebsiteValidForEmail() { + j := NewEmailJob(j.ParentID, entry) + return entry, []scrapemate.IJob{j}, nil + } } - return &entry, nil, err + return entry, nil, nil } func (j *PlaceJob) BrowserActions(ctx context.Context, page playwright.Page) scrapemate.Response { @@ -157,26 +210,6 @@ func (j *PlaceJob) BrowserActions(ctx context.Context, page playwright.Page) scr resp.Meta["json"] = raw - if j.ExtractExtraReviews { - reviewCount := j.getReviewCount(raw) - if reviewCount > 8 { // we have more reviews - params := fetchReviewsParams{ - page: page, - mapURL: page.URL(), - reviewCount: reviewCount, - } - - reviewFetcher := newReviewFetcher(params) - - reviewData, err := reviewFetcher.fetch(ctx) - if err != nil { - return resp - } - - resp.Meta["reviews_raw"] = reviewData - } - } - return resp } diff --git a/gmaps/reviews.go 
b/gmaps/reviews.go index 63fa95f2..50c3dc31 100644 --- a/gmaps/reviews.go +++ b/gmaps/reviews.go @@ -7,9 +7,11 @@ import ( "encoding/json" "errors" "fmt" + "log" "net/url" "regexp" "strings" + "time" "github.com/gosom/scrapemate" "github.com/gosom/scrapemate/adapters/fetchers/stealth" @@ -82,7 +84,6 @@ func (f *fetcher) fetch(ctx context.Context) (fetchReviewsResponse, error) { return ans, nil } -// Note the added 'requestID' parameter func (f *fetcher) generateURL(mapURL, pageToken string, pageSize int, requestID string) (string, error) { placeIDRegex := regexp.MustCompile(`!1s([^!]+)`) @@ -179,3 +180,575 @@ func generateRandomID(length int) (string, error) { return "", errors.New("generated ID is shorter than expected") } + +type DOMReview struct { + AuthorName string + AuthorURL string + Rating float64 + RelativeTimeDescription string + Text string +} + +func extractReviewsFromDOM(ctx context.Context, iframe playwright.Frame) ([]DOMReview, error) { + reviewsJSON, err := iframe.Evaluate(`() => { + try { + const reviews = []; + const reviewElements = document.querySelectorAll('.jftiEf'); + + for (const element of reviewElements) { + try { + const userElement = element.querySelector('.d4r55'); + const userName = userElement ? userElement.textContent.trim() : ""; + const userUrl = userElement && userElement.tagName.toLowerCase() === 'a' ? + userElement.getAttribute('href') : ""; + + const ratingElement = element.querySelector('.kvMYJc'); + let rating = 0; + if (ratingElement) { + const ariaLabel = ratingElement.getAttribute('aria-label'); + if (ariaLabel) { + const match = ariaLabel.match(/(\d+)[\s\S]*?(\d+)/); + if (match && match.length >= 3) { + rating = parseFloat(match[2]) || 0; + } + } + } + + const timeElement = element.querySelector('.rsqaWe'); + const relativeTime = timeElement ? timeElement.textContent.trim() : ""; + + const textElement = element.querySelector('.wiI7pd'); + let text = textElement ? 
textElement.textContent.trim() : "";
+
+					const moreButton = element.querySelector('.w8nwRe');
+					if (moreButton && text.includes('...')) {
+						try {
+							const originalLength = text.length;
+
+							moreButton.click();
+
+							for (let i = 0; i < 1000000; i++) {
+								if (i % 100000 === 0) {
+									const updatedText = textElement.textContent.trim();
+									if (updatedText.length > originalLength) {
+										text = updatedText;
+										break;
+									}
+								}
+							}
+						} catch (e) {
+							console.error("Error expanding review text:", e);
+						}
+					}
+
+					if (userName) {
+						reviews.push({
+							author_name: userName,
+							author_url: userUrl,
+							rating: rating,
+							relative_time_description: relativeTime,
+							text: text
+						});
+					}
+				} catch (e) {
+					console.error("Error extracting review data:", e);
+				}
+			}
+
+			return reviews;
+		} catch (e) {
+			console.error("Error extracting reviews:", e);
+			return [];
+		}
+	}`)
+
+	if err != nil {
+		return nil, fmt.Errorf("error evaluating JavaScript to extract reviews: %w", err)
+	}
+
+	rawReviews, ok := reviewsJSON.([]interface{})
+	if !ok {
+		return nil, fmt.Errorf("unexpected response format")
+	}
+
+	reviews := make([]DOMReview, 0, len(rawReviews))
+
+	for _, rawReview := range rawReviews {
+		reviewMap, ok := rawReview.(map[string]interface{})
+		if !ok {
+			continue
+		}
+
+		review := DOMReview{}
+
+		if authorName, ok := reviewMap["author_name"].(string); ok {
+			review.AuthorName = authorName
+		}
+
+		if authorURL, ok := reviewMap["author_url"].(string); ok {
+			review.AuthorURL = authorURL
+		}
+
+		if rating, ok := reviewMap["rating"].(float64); ok {
+			review.Rating = rating
+		}
+
+		if relativeTime, ok := reviewMap["relative_time_description"].(string); ok {
+			review.RelativeTimeDescription = relativeTime
+		}
+
+		if text, ok := reviewMap["text"].(string); ok {
+			review.Text = text
+		}
+
+		if review.AuthorName != "" {
+			reviews = append(reviews, review)
+		}
+	}
+
+	return reviews, nil
+}
+
+// jsNumberToInt converts a numeric result of a Playwright Evaluate call to an
+// int. Any other value, including nil, yields 0.
+//
+// NOTE(review): playwright-go presumably hands whole JavaScript numbers back as
+// int and fractional ones as float64 - confirm against the pinned version. Both
+// are accepted here so that a plain type assertion cannot panic.
+func jsNumberToInt(v interface{}) int {
+	switch n := v.(type) {
+	case int:
+		return n
+	case int64:
+		return int(n)
+	case float64:
+		return int(n)
+	default:
+		return 0
+	}
+}
+
+// scrollReviews scrolls the reviews container inside the frame returned by
+// findReviewsIframe and collects the reviews rendered in its DOM.
+//
+// It returns the number of '.jftiEf' elements counted on the last iteration,
+// the de-duplicated reviews gathered so far, and ctx.Err() if the context was
+// cancelled while scrolling. A limit <= 0 disables the limit; otherwise the
+// returned slice holds at most limit reviews. When no iframe can be located
+// the work is delegated to extractReviewsDirectly.
+func scrollReviews(ctx context.Context, page playwright.Page, limit int) (int, []DOMReview, error) {
+	const (
+		saveInterval      = 20
+		maxScrollAttempts = 50
+	)
+
+	startTime := time.Now()
+	reviewCount := 0
+	var reviews []DOMReview
+
+	time.Sleep(3 * time.Second)
+
+	iframe, err := findReviewsIframe(ctx, page)
+	if err != nil {
+		log.Printf("Failed to find reviews iframe, trying direct page approach: %v", err)
+		return extractReviewsDirectly(ctx, page, limit)
+	}
+
+	if iframe == nil {
+		log.Printf("No reviews iframe found, trying direct page approach")
+		return extractReviewsDirectly(ctx, page, limit)
+	}
+
+	scrollAttempts := 0
+	lastReviewCount := 0
+	stuckCounter := 0
+
+	log.Printf("Starting to scroll reviews (limit: %d)", limit)
+
+	for {
+		// Cancellation is polled with ctx.Err() instead of a select statement:
+		// a break inside a select clause only terminates the select, so the
+		// exit conditions below would never leave this loop.
+		if ctx.Err() != nil {
+			finalReviews, _ := extractReviewsFromDOM(ctx, iframe)
+			reviews = mergeUniqueReviews(reviews, finalReviews)
+			return reviewCount, reviews, ctx.Err()
+		}
+
+		currentCount, err := iframe.Evaluate(`() => {
+			try {
+				const reviews = document.querySelectorAll('.jftiEf');
+				return reviews ? reviews.length : 0;
+			} catch (e) {
+				console.error("Error counting reviews:", e);
+				return 0;
+			}
+		}`)
+
+		if err != nil {
+			log.Printf("Warning: error counting reviews: %v", err)
+			currentCount = nil
+		} else if currentCount == nil {
+			log.Printf("Warning: got nil when counting reviews")
+		}
+
+		reviewCount = jsNumberToInt(currentCount)
+
+		if reviewCount == lastReviewCount {
+			stuckCounter++
+			if stuckCounter > 10 {
+				log.Printf("Stuck at %d reviews after multiple scroll attempts, trying to break through...", reviewCount)
+
+				// Best effort only: the result of the click is deliberately ignored.
+				_, _ = iframe.Evaluate(`() => {
+					try {
+						const moreBtn = document.querySelector('.w8nwRe');
+						if (moreBtn) {
+							moreBtn.click();
+							return true;
+						}
+
+						// Try alternative buttons
+						const altButtons = document.querySelectorAll('button');
+						for (const btn of altButtons) {
+							if (btn.textContent.includes('More') || btn.textContent.includes('Show')) {
+								btn.click();
+								return true;
+							}
+						}
+						return false;
+					} catch (e) {
+						console.error("Error clicking more button:", e);
+						return false;
+					}
+				}`)
+
+				stuckCounter = 0
+				// Being stuck consumes the scroll budget faster.
+				scrollAttempts += 5
+			}
+		} else {
+			stuckCounter = 0
+			lastReviewCount = reviewCount
+		}
+
+		if reviewCount > 0 && (reviewCount%saveInterval == 0 || scrollAttempts%10 == 0) && len(reviews) < reviewCount {
+			newReviews, err := extractReviewsFromDOM(ctx, iframe)
+			if err != nil {
+				log.Printf("Warning: error extracting reviews: %v", err)
+			} else {
+				beforeCount := len(reviews)
+				reviews = mergeUniqueReviews(reviews, newReviews)
+				log.Printf("Extracted %d reviews, added %d new unique reviews, total: %d",
+					len(newReviews), len(reviews)-beforeCount, len(reviews))
+			}
+		}
+
+		reachedEnd, err := iframe.Evaluate(`() => {
+			try {
+				const selectors = [
+					'.m6QErb.DxyBCb.kA9KIf.dS8AEf',
+					'.m6QErb.DxyBCb.kA9KIf',
+					'.DxyBCb.kA9KIf',
+					'.m6QErb',
+					'.section-scrollbox'
+				];
+
+				let scrollElement = null;
+				for (const selector of selectors) {
+					const el = document.querySelector(selector);
+					if (el) {
+						scrollElement = el;
+						break;
+					}
+				}
+
+				if (!scrollElement) {
+					return false;
+				}
+
+				// Check if we've reached the bottom
+				const scrollHeight = scrollElement.scrollHeight || 0;
+				const scrollTop = scrollElement.scrollTop || 0;
+				const clientHeight = scrollElement.clientHeight || 0;
+
+				// Consider end reached if we're within 5 pixels of the bottom
+				return scrollHeight > 0 && (scrollTop + clientHeight + 5 >= scrollHeight);
+			} catch (e) {
+				console.error("Error checking scroll position:", e);
+				return false;
+			}
+		}`)
+
+		if err != nil {
+			log.Printf("Warning: error checking if reached end: %v", err)
+		} else if atEnd, _ := reachedEnd.(bool); atEnd {
+			log.Println("Reached end of reviews, no more to load")
+			break
+		}
+
+		if limit > 0 && len(reviews) >= limit {
+			log.Printf("Reached limit of %d unique reviews", limit)
+			break
+		}
+
+		scrollAttempts++
+		if scrollAttempts >= maxScrollAttempts {
+			log.Printf("Made %d scroll attempts, stopping to avoid infinite loop", scrollAttempts)
+			break
+		}
+
+		_, err = iframe.Evaluate(`() => {
+			try {
+				const selectors = [
+					'.m6QErb.DxyBCb.kA9KIf.dS8AEf',
+					'.m6QErb.DxyBCb.kA9KIf',
+					'.DxyBCb.kA9KIf',
+					'.m6QErb',
+					'.section-scrollbox',
+					'div[role="feed"]'
+				];
+
+				let scrollElement = null;
+				for (const selector of selectors) {
+					const el = document.querySelector(selector);
+					if (el) {
+						scrollElement = el;
+						break;
+					}
+				}
+
+				if (!scrollElement) {
+					console.log("No scroll container found, scrolling document");
+					window.scrollBy(0, 600);
+					return true;
+				}
+
+				if (typeof scrollElement.scrollBy === 'function') {
+					scrollElement.scrollBy(0, 600);
+				} else {
+					const currentScrollTop = scrollElement.scrollTop || 0;
+					scrollElement.scrollTop = currentScrollTop + 600;
+				}
+				return true;
+			} catch (e) {
+				console.error("Error scrolling:", e);
+				try {
+					window.scrollBy(0, 600);
+					return true;
+				} catch (e2) {
+					console.error("Error scrolling document:", e2);
+					return false;
+				}
+			}
+		}`)
+
+		if err != nil {
+			log.Printf("Warning: error scrolling: %v", err)
+		}
+
+		time.Sleep(500 * time.Millisecond)
+	}
+
+	// Whatever ended the loop, pick up the reviews rendered since the last
+	// periodic extraction so that none of the loaded ones are lost.
+	finalReviews, err := extractReviewsFromDOM(ctx, iframe)
+	if err != nil {
+		log.Printf("Warning: error extracting reviews: %v", err)
+	} else if len(finalReviews) > 0 {
+		beforeCount := len(reviews)
+		reviews = mergeUniqueReviews(reviews, finalReviews)
+		log.Printf("Added %d final unique reviews, total: %d",
+			len(reviews)-beforeCount, len(reviews))
+	}
+
+	// Honour the requested limit on the returned slice as well.
+	if limit > 0 && len(reviews) > limit {
+		reviews = reviews[:limit]
+	}
+
+	log.Printf("Finished scrolling after %s, found %d reviews", time.Since(startTime), len(reviews))
+	return reviewCount, reviews, nil
+}
+
+// findReviewsIframe locates an iframe element on the page, trying a primary
+// selector first and a list of alternatives afterwards, and returns the frame
+// that belongs to it. The ctx parameter is currently not used.
+func findReviewsIframe(ctx context.Context, page playwright.Page) (playwright.Frame, error) {
+	iframeSelector := "iframe[src*=\"preview=place\"]"
+
+	iframeHandle, err := page.QuerySelector(iframeSelector)
+	if err != nil || iframeHandle == nil {
+		alternativeSelectors := []string{
+			"iframe.xmEWXe",
+			"iframe[src*=\"maps\"]",
+			"iframe[title=\"Google Maps\"]",
+			"iframe[aria-label*=\"Map\"]",
+		}
+
+		for _, selector := range alternativeSelectors {
+			iframeHandle, err = page.QuerySelector(selector)
+			if err == nil && iframeHandle != nil {
+				break
+			}
+		}
+	}
+
+	if err != nil || iframeHandle == nil {
+		return nil, fmt.Errorf("no iframe found on page")
+	}
+
+	defer iframeHandle.Dispose()
+
+	// Resolve the frame from the element handle itself. The element's "id"
+	// attribute is not a frame identifier, and an iframe without an id would
+	// otherwise be rejected even though it was found.
+	frame, err := iframeHandle.ContentFrame()
+	if err != nil {
+		return nil, fmt.Errorf("failed to get iframe content frame: %w", err)
+	}
+
+	if frame == nil {
+		return nil, fmt.Errorf("iframe element has no content frame")
+	}
+
+	return frame, nil
+}
+
+// extractReviewsDirectly collects reviews from the main page (no iframe) by
+// alternately reading the review elements from the DOM and scrolling.
+//
+// It returns the number of unique reviews collected, the reviews themselves,
+// and an error when the page shows no review elements at all or when ctx is
+// cancelled while scrolling. A limit <= 0 disables the limit; otherwise at
+// most limit reviews are returned.
+func extractReviewsDirectly(ctx context.Context, page playwright.Page, limit int) (int, []DOMReview, error) {
+	const maxScrollAttempts = 30
+
+	log.Printf("Attempting to extract reviews directly from main page")
+
+	time.Sleep(2 * time.Second)
+
+	hasReviews, err := page.Evaluate(`() => {
+		const reviewElements = document.querySelectorAll('.jftiEf');
+		const altReviewElements = document.querySelectorAll('div[data-review-id]');
+		return (reviewElements && reviewElements.length > 0) ||
+			(altReviewElements && altReviewElements.length > 0);
+	}`)
+
+	// The checked assertion treats nil and non-boolean results as "no reviews"
+	// instead of panicking.
+	if found, _ := hasReviews.(bool); err != nil || !found {
+		log.Printf("No reviews found directly on page")
+		return 0, nil, fmt.Errorf("no reviews found on page")
+	}
+
+	// Similar logic to iframe reviews but directly on page
+	reviewCount := 0
+	var reviews []DOMReview
+
+	for scrollAttempts := 0; scrollAttempts < maxScrollAttempts; scrollAttempts++ {
+		// Stop early when the caller gave up; hand back what was collected.
+		if ctx.Err() != nil {
+			return reviewCount, reviews, ctx.Err()
+		}
+
+		// Get current reviews
+		reviewsJSON, err := page.Evaluate(`() => {
+			try {
+				const reviews = [];
+				const reviewElements = document.querySelectorAll('.jftiEf, div[data-review-id]');
+
+				for (const element of reviewElements) {
+					try {
+						const userElement = element.querySelector('.d4r55, .WNxzHc');
+						const userName = userElement ? userElement.textContent.trim() : "";
+						const userUrl = userElement && userElement.tagName.toLowerCase() === 'a' ?
+							userElement.getAttribute('href') : "";
+
+						const ratingElement = element.querySelector('.kvMYJc, .pjemBf span');
+						let rating = 0;
+						if (ratingElement) {
+							const ariaLabel = ratingElement.getAttribute('aria-label');
+							if (ariaLabel) {
+								const match = ariaLabel.match(/(\d+)[\s\S]*?(\d+)/);
+								if (match && match.length >= 3) {
+									rating = parseFloat(match[2]) || 0;
+								}
+							}
+						}
+
+						const timeElement = element.querySelector('.rsqaWe, .tTVLSc');
+						const relativeTime = timeElement ? timeElement.textContent.trim() : "";
+
+						const textElement = element.querySelector('.wiI7pd, .MyEned');
+						let text = textElement ? textElement.textContent.trim() : "";
+
+						if (userName) {
+							reviews.push({
+								author_name: userName,
+								author_url: userUrl,
+								rating: rating,
+								relative_time_description: relativeTime,
+								text: text
+							});
+						}
+					} catch (e) {
+						console.error("Error extracting review data:", e);
+					}
+				}
+
+				return reviews;
+			} catch (e) {
+				console.error("Error extracting reviews:", e);
+				return [];
+			}
+		}`)
+
+		if err != nil {
+			log.Printf("Warning: error extracting reviews directly: %v", err)
+		} else if rawReviews, ok := reviewsJSON.([]interface{}); ok {
+			newReviews := make([]DOMReview, 0, len(rawReviews))
+
+			for _, rawReview := range rawReviews {
+				reviewMap, ok := rawReview.(map[string]interface{})
+				if !ok {
+					continue
+				}
+
+				review := DOMReview{}
+
+				if authorName, ok := reviewMap["author_name"].(string); ok {
+					review.AuthorName = authorName
+				}
+
+				if authorURL, ok := reviewMap["author_url"].(string); ok {
+					review.AuthorURL = authorURL
+				}
+
+				// NOTE(review): whole numbers presumably arrive as int rather
+				// than float64 - confirm against the pinned playwright-go.
+				switch rating := reviewMap["rating"].(type) {
+				case float64:
+					review.Rating = rating
+				case int:
+					review.Rating = float64(rating)
+				}
+
+				if relativeTime, ok := reviewMap["relative_time_description"].(string); ok {
+					review.RelativeTimeDescription = relativeTime
+				}
+
+				if text, ok := reviewMap["text"].(string); ok {
+					review.Text = text
+				}
+
+				if review.AuthorName != "" {
+					newReviews = append(newReviews, review)
+				}
+			}
+
+			oldCount := len(reviews)
+			reviews = mergeUniqueReviews(reviews, newReviews)
+			reviewCount = len(reviews)
+
+			if oldCount < reviewCount {
+				log.Printf("Found %d total reviews directly on page", reviewCount)
+			}
+
+			if limit > 0 && reviewCount >= limit {
+				log.Printf("Reached limit of %d direct reviews", limit)
+				break
+			}
+		}
+
+		// Scroll to get more reviews
+		if _, err := page.Evaluate(`() => {
+			try {
+				const reviewsContainer = document.querySelector('.m6QErb, .DxyBCb, div[role="feed"]');
+				if (reviewsContainer) {
+					reviewsContainer.scrollBy(0, 800);
+				} else {
+					window.scrollBy(0, 800);
+				}
+				return true;
+			} catch (e) {
+				console.error("Error scrolling:", e);
+				window.scrollBy(0, 800);
+				return false;
+			}
+		}`); err != nil {
+			log.Printf("Warning: error scrolling: %v", err)
+		}
+
+		time.Sleep(800 * time.Millisecond)
+	}
+
+	// Honour the requested limit on the returned slice as well.
+	if limit > 0 && len(reviews) > limit {
+		reviews = reviews[:limit]
+		reviewCount = len(reviews)
+	}
+
+	return reviewCount, reviews, nil
+}
+
+// mergeUniqueReviews returns a new slice containing every review of existing
+// followed by those reviews of new that have a text and are not duplicates of
+// a review already in the result.
+//
+// Two reviews are duplicates when they share the author name and either have
+// identical text or both have more than 20 bytes of text with the same first
+// 20 bytes. The prefix is compared byte-wise and is only used as a lookup key.
+func mergeUniqueReviews(existing []DOMReview, new []DOMReview) []DOMReview {
+	// reviewKey encodes the duplicate rule above so that membership can be
+	// tested with a map lookup instead of rescanning the slice per review.
+	type reviewKey struct {
+		author string
+		head   string
+		long   bool
+	}
+
+	keyOf := func(r DOMReview) reviewKey {
+		if len(r.Text) > 20 {
+			return reviewKey{author: r.AuthorName, head: r.Text[:20], long: true}
+		}
+		return reviewKey{author: r.AuthorName, head: r.Text}
+	}
+
+	result := make([]DOMReview, 0, len(existing)+len(new))
+	seen := make(map[reviewKey]struct{}, len(existing)+len(new))
+
+	for _, review := range existing {
+		result = append(result, review)
+		seen[keyOf(review)] = struct{}{}
+	}
+
+	for _, review := range new {
+		if review.Text == "" {
+			continue
+		}
+
+		key := keyOf(review)
+		if _, isDuplicate := seen[key]; isDuplicate {
+			continue
+		}
+
+		// Recording the key right away also filters duplicates inside new.
+		seen[key] = struct{}{}
+		result = append(result, review)
+	}
+
+	return result
+}
+
+
diff --git a/gmaps/searchjob.go b/gmaps/searchjob.go
index 81f9e1a4..4d53bd3a 100644
--- a/gmaps/searchjob.go
+++ b/gmaps/searchjob.go
@@ -5,6 +5,8 @@ import (
 	"context"
 	"fmt"
 	"net/http"
+	"net/url"
+	"strings"
 
 	"github.com/google/uuid"
 	"github.com/gosom/google-maps-scraper/exiter"
@@ -39,9 +41,73 @@ func NewSearchJob(params *MapSearchParams, opts ...SearchJobOptions) *SearchJob
 	const (
 		defaultPrio       = scrapemate.PriorityMedium
 		defaultMaxRetries = 3
-		baseURL           = "https://maps.google.com/search"
+
baseURL = "https://www.google.com/maps" ) + // Ensure params are not nil + if params == nil { + params = &MapSearchParams{ + Hl: "en", + } + } + + // Special handling for the problematic URL pattern seen in logs + if params.Query != "" && strings.Contains(params.Query, "https://www.google.com/maps/place/Your+Business/@xx.xxxx,yy.yyyy,17z") { + // This is a template URL, not a real query - replace with a simple business search + fmt.Println("Detected template URL pattern, replacing with simpler query") + params.Query = "business" + } + + // Clean up the query if it's a URL + if params.Query != "" { + // Check if the query itself is a URL (especially a Google Maps URL) + if strings.HasPrefix(params.Query, "http") { + // Extract just the business name or location if it's a maps URL + if strings.Contains(params.Query, "google.com/maps") { + // Try to extract just the search term or business name + parts := strings.Split(params.Query, "/") + if len(parts) > 0 { + // Get the last non-empty part that's not coordinates + for i := len(parts) - 1; i >= 0; i-- { + part := parts[i] + if part != "" && + !strings.HasPrefix(part, "@") && + !strings.Contains(part, ",") && + !strings.Contains(part, ".") { + params.Query = part + fmt.Printf("Extracted query from URL: %s\n", params.Query) + break + } + } + + // If we couldn't find a good part, just use a generic term + if strings.HasPrefix(params.Query, "http") { + params.Query = "business" + } + } + } else { + // For other URLs, just use the domain as the search term + u, err := url.Parse(params.Query) + if err == nil && u.Host != "" { + params.Query = u.Host + fmt.Printf("Using domain as query: %s\n", params.Query) + } + } + } + } + + // Set default query if empty + if params.Query == "" && params.Location.Lat != 0 && params.Location.Lon != 0 { + // Use coordinates in the search if no query + params.Query = fmt.Sprintf("%.6f,%.6f", params.Location.Lat, params.Location.Lon) + } + + // Final sanity check - if query still starts with 
http, just use a simple term + if strings.HasPrefix(params.Query, "http") { + params.Query = "business" + fmt.Println("Query still a URL after cleaning, using 'business' instead") + } + job := SearchJob{ Job: scrapemate.Job{ ID: uuid.New().String(), @@ -51,9 +117,11 @@ func NewSearchJob(params *MapSearchParams, opts ...SearchJobOptions) *SearchJob MaxRetries: defaultMaxRetries, Priority: defaultPrio, }, + params: params, } - - job.params = params + + // Log the final URL for debugging + fmt.Printf("Created search job with URL: %s, params: %v\n", job.URL, job.URLParams) for _, opt := range opts { opt(&job) @@ -75,9 +143,22 @@ func (j *SearchJob) Process(_ context.Context, resp *scrapemate.Response) (any, resp.Meta = nil }() + if resp.Body == nil { + return nil, nil, fmt.Errorf("response body is nil") + } + body := removeFirstLine(resp.Body) if len(body) == 0 { - return nil, nil, fmt.Errorf("empty response body") + if page, ok := resp.Document.(scrapemate.PlaywrightPage); ok { + content, err := page.Page().Content() + if err == nil && content != "" { + body = []byte(content) + } else { + return nil, nil, fmt.Errorf("empty response body and failed to get page content: %w", err) + } + } else { + return nil, nil, fmt.Errorf("empty response body and document is not a Playwright page") + } } entries, err := ParseSearchResults(body) @@ -114,27 +195,65 @@ func removeFirstLine(data []byte) []byte { } func buildGoogleMapsParams(params *MapSearchParams) map[string]string { - params.ViewportH = 800 - params.ViewportW = 600 + if params.ViewportH == 0 { + params.ViewportH = 800 + } + + if params.ViewportW == 0 { + params.ViewportW = 600 + } + + if params.Hl == "" { + params.Hl = "en" + } + + // Make sure the query is not a URL + if params.Query != "" { + // If it still looks like a URL after previous cleaning, extract just alphanumeric parts + if strings.HasPrefix(params.Query, "http") || strings.Contains(params.Query, "://") { + // Simplify to just the domain or last path 
component + parts := strings.Split(params.Query, "/") + if len(parts) > 2 { + domainPart := parts[2] // domain is usually the 3rd part (after http: and //) + if domainPart != "" { + params.Query = domainPart + } else { + // Find the first non-empty part + for _, part := range parts { + if part != "" && !strings.Contains(part, ":") { + params.Query = part + break + } + } + } + } + } + } ans := map[string]string{ - "tbm": "map", "authuser": "0", "hl": params.Hl, - "q": params.Query, } - pb := fmt.Sprintf("!4m12!1m3!1d3826.902183192154!2d%.4f!3d%.4f!2m3!1f0!2f0!3f0!3m2!1i%d!2i%d!4f%.1f!7i20!8i0"+ - "!10b1!12m22!1m3!18b1!30b1!34e1!2m3!5m1!6e2!20e3!4b0!10b1!12b1!13b1!16b1!17m1!3e1!20m3!5e2!6b1!14b1!46m1!1b0"+ - "!96b1!19m4!2m3!1i360!2i120!4i8", - params.Location.Lon, - params.Location.Lat, - params.ViewportW, - params.ViewportH, - params.Location.ZoomLvl, - ) - - ans["pb"] = pb + // Add query parameter only if it exists and is not a URL + if params.Query != "" { + ans["q"] = params.Query + } + + // If we have latitude and longitude, use them in the URL + if params.Location.Lat != 0 && params.Location.Lon != 0 { + // Standard Google Maps search format + pb := fmt.Sprintf("!4m12!1m3!1d%.8f!2d%.8f!3d%.8f!2m3!1f0!2f0!3f0!3m2!1i%d!2i%d!4f%.1f!10b1", + 0.001, + params.Location.Lon, + params.Location.Lat, + params.ViewportW, + params.ViewportH, + params.Location.ZoomLvl, + ) + + ans["pb"] = pb + } return ans } diff --git a/runner/jobs.go b/runner/jobs.go index e1033125..258351d7 100644 --- a/runner/jobs.go +++ b/runner/jobs.go @@ -28,6 +28,7 @@ func CreateSeedJobs( dedup deduper.Deduper, exitMonitor exiter.Exiter, extraReviews bool, + reviewsLimit int, ) (jobs []scrapemate.IJob, err error) { var lat, lon float64 @@ -76,6 +77,50 @@ func CreateSeedJobs( continue } + // Handle problematic URL patterns + if strings.Contains(query, "https://www.google.com/maps/place/Your+Business/@xx.xxxx,yy.yyyy,17z") { + fmt.Println("WARNING: Detected template URL. 
Replacing with a simple business search.") + query = "business" + } + + // Clean URLs that are mistakenly used as search terms + if strings.HasPrefix(query, "http") { + fmt.Printf("WARNING: Input looks like a URL: %s\nCleaning for better search results.\n", query) + + // For Google Maps URLs, extract meaningful parts + if strings.Contains(query, "google.com/maps") { + parts := strings.Split(query, "/") + cleaned := false + + // Try to extract a meaningful part (not coordinates, not empty) + for i := len(parts) - 1; i >= 0; i-- { + part := parts[i] + if part != "" && + !strings.HasPrefix(part, "@") && + !strings.Contains(part, ",") && + !strings.Contains(part, ".") { + query = part + fmt.Printf("Extracted query: %s\n", query) + cleaned = true + break + } + } + + // If we couldn't find a good part, use a generic term + if !cleaned { + query = "business" + fmt.Println("Could not extract meaningful search term from URL. Using 'business'.") + } + } else { + // For regular URLs, use the domain + parts := strings.Split(query, "/") + if len(parts) > 2 { + query = parts[2] // Usually the domain name + fmt.Printf("Using domain as query: %s\n", query) + } + } + } + var id string if before, after, ok := strings.Cut(query, "#!#"); ok { @@ -100,7 +145,7 @@ func CreateSeedJobs( opts = append(opts, gmaps.WithExtraReviews()) } - job = gmaps.NewGmapJob(id, langCode, query, maxDepth, email, geoCoordinates, zoom, opts...) + job = gmaps.NewGmapJob(id, langCode, query, maxDepth, email, geoCoordinates, zoom, reviewsLimit, opts...) 
} else { jparams := gmaps.MapSearchParams{ Location: gmaps.MapLocation{ From 355833f2acf6fac4dd5e1e88763288b671accac5 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 14 May 2025 14:48:18 +0800 Subject: [PATCH 3/3] Add test files and sync modified modules --- gmaps/entry.go | 15 +++++++++++++++ runner/databaserunner/databaserunner.go | 1 + runner/filerunner/filerunner.go | 1 + runner/lambdaaws/io.go | 1 + runner/lambdaaws/lambdaaws.go | 3 ++- runner/runner.go | 2 ++ runner/webrunner/webrunner.go | 1 + testcase/cleaned_urls.txt | 1 + testcase/test.go | 18 ++++++++++++++++++ 9 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 testcase/cleaned_urls.txt create mode 100644 testcase/test.go diff --git a/gmaps/entry.go b/gmaps/entry.go index f929baa6..53157457 100644 --- a/gmaps/entry.go +++ b/gmaps/entry.go @@ -679,3 +679,18 @@ func filterAndSortEntriesWithinRadius(entries []*Entry, lat, lon, radius float64 return slices.Collect(iter.Seq[*Entry](resultIterator)) } + +func (e *Entry) AddReview(name, profilePicture string, rating float64, when, description string) { + intRating := int(math.Round(rating)) + + review := Review{ + Name: name, + ProfilePicture: profilePicture, + Rating: intRating, + Description: description, + When: when, + Images: []string{}, + } + + e.UserReviewsExtended = append(e.UserReviewsExtended, review) +} diff --git a/runner/databaserunner/databaserunner.go b/runner/databaserunner/databaserunner.go index 9ac2be96..3d734f66 100644 --- a/runner/databaserunner/databaserunner.go +++ b/runner/databaserunner/databaserunner.go @@ -152,6 +152,7 @@ func (d *dbrunner) produceSeedJobs(ctx context.Context) error { nil, nil, d.cfg.ExtraReviews, + d.cfg.ReviewsLimit, ) if err != nil { return err diff --git a/runner/filerunner/filerunner.go b/runner/filerunner/filerunner.go index 9312adef..8f1a1398 100644 --- a/runner/filerunner/filerunner.go +++ b/runner/filerunner/filerunner.go @@ -87,6 +87,7 @@ func (r *fileRunner) Run(ctx context.Context) 
(err error) { dedup, exitMonitor, r.cfg.ExtraReviews, + r.cfg.ReviewsLimit, ) if err != nil { return err diff --git a/runner/lambdaaws/io.go b/runner/lambdaaws/io.go index c7a8b307..ed6371a2 100644 --- a/runner/lambdaaws/io.go +++ b/runner/lambdaaws/io.go @@ -11,4 +11,5 @@ type lInput struct { FunctionName string `json:"function_name"` DisablePageReuse bool `json:"disable_page_reuse"` ExtraReviews bool `json:"extra_reviews"` + ReviewsLimit int `json:"reviews_limit"` } diff --git a/runner/lambdaaws/lambdaaws.go b/runner/lambdaaws/lambdaaws.go index 8322000c..2f70f3cf 100644 --- a/runner/lambdaaws/lambdaaws.go +++ b/runner/lambdaaws/lambdaaws.go @@ -75,7 +75,7 @@ func (l *lambdaAwsRunner) handler(ctx context.Context, input lInput) error { in := strings.NewReader(strings.Join(input.Keywords, "\n")) var seedJobs []scrapemate.IJob - + exitMonitor := exiter.New() seedJobs, err = runner.CreateSeedJobs( @@ -90,6 +90,7 @@ func (l *lambdaAwsRunner) handler(ctx context.Context, input lInput) error { nil, exitMonitor, input.ExtraReviews, + input.ReviewsLimit, ) if err != nil { return err diff --git a/runner/runner.go b/runner/runner.go index ec77140e..41680ce4 100644 --- a/runner/runner.go +++ b/runner/runner.go @@ -79,6 +79,7 @@ type Config struct { Addr string DisablePageReuse bool ExtraReviews bool + ReviewsLimit int } func ParseConfig() *Config { @@ -125,6 +126,7 @@ func ParseConfig() *Config { flag.StringVar(&cfg.Addr, "addr", ":8080", "address to listen on for web server") flag.BoolVar(&cfg.DisablePageReuse, "disable-page-reuse", false, "disable page reuse in playwright") flag.BoolVar(&cfg.ExtraReviews, "extra-reviews", false, "enable extra reviews collection") + flag.IntVar(&cfg.ReviewsLimit, "reviews", 300, "limit the number of reviews collected (-1 for unlimited)") flag.Parse() diff --git a/runner/webrunner/webrunner.go b/runner/webrunner/webrunner.go index 61484f0a..42e02e5e 100644 --- a/runner/webrunner/webrunner.go +++ b/runner/webrunner/webrunner.go @@ -195,6 
+195,7 @@ func (w *webrunner) scrapeJob(ctx context.Context, job *web.Job) error { dedup, exitMonitor, w.cfg.ExtraReviews, + w.cfg.ReviewsLimit, ) if err != nil { err2 := w.svc.Update(ctx, job) diff --git a/testcase/cleaned_urls.txt b/testcase/cleaned_urls.txt new file mode 100644 index 00000000..49100434 --- /dev/null +++ b/testcase/cleaned_urls.txt @@ -0,0 +1 @@ +business \ No newline at end of file diff --git a/testcase/test.go b/testcase/test.go new file mode 100644 index 00000000..4ec205b7 --- /dev/null +++ b/testcase/test.go @@ -0,0 +1,18 @@ +package main + +import ( + "fmt" + "strings" +) + +func main() { + query := "https://www.google.com/maps/place/Your+Business/@xx.xxxx,yy.yyyy,17z" + + // Handle problematic URL patterns + if strings.Contains(query, "https://www.google.com/maps/place/Your+Business/@xx.xxxx,yy.yyyy,17z") { + fmt.Println("WARNING: Detected template URL. Replacing with a simple business search.") + query = "business" + } + + fmt.Printf("Final query: %s\n", query) +} \ No newline at end of file