# attachmentsking/main.py at main · justinmstevenson/attachmentsking · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import requests
from bs4 import BeautifulSoup
# from time import sleep
from pprint import pprint
import csv
import re
import os
from urllib.parse import urljoin

# Unique, fully-qualified product page URLs, appended in discovery order by
# the page-crawl loop below and consumed by the product-detail loop.
product_links = []

def clean_data(text):
    """Normalise the whitespace in *text*.

    Carriage returns are dropped outright (they do not become a space),
    every remaining run of whitespace is collapsed to a single space, and
    leading/trailing whitespace is removed.
    """
    without_carriage_returns = text.replace("\r", "")
    # split() with no arguments breaks on any whitespace run and discards
    # empty leading/trailing pieces, so joining with one space both collapses
    # and trims in a single step.
    words = without_carriage_returns.split()
    return " ".join(words)

# Crawl the paginated "all products" collection (pages 1 to 41) and collect
# every unique product URL, keeping the order in which they are first seen.
# The set gives O(1) duplicate checks; product_links itself stays an ordered list.
seen_links = set(product_links)
for page_number in range(1, 42):
    print(f"Parsing page {page_number}")
    url = f"https://attachmentsking.com/collections/all?page={page_number}&grid_list=grid-view"
    try:
        # Without a timeout a single stalled connection would hang the crawl.
        response = requests.get(url, timeout=30)
    except requests.RequestException as error:
        print(f"Skipping page {page_number}: {error}")
        continue
    soup = BeautifulSoup(response.content, "html.parser")

    for link in soup.find_all("a", href=True):
        href = link["href"]
        if "/product" not in href:
            continue
        # urljoin resolves site-relative hrefs against the page URL and leaves
        # already-absolute hrefs intact, instead of blindly gluing the host in
        # front of them.
        full_link = urljoin(url, href)
        if full_link not in seen_links:
            seen_links.add(full_link)
            product_links.append(full_link)

# Count the number of unique product URLs
number_of_products = len(product_links)
print(f"Number of unique product URLs: {number_of_products}")

# State shared with the product-detail loop that follows.
product_details = {}
counter = 0
images_directory = "images"

# exist_ok avoids the separate exists() check and the race it leaves open.
os.makedirs(images_directory, exist_ok=True)

# Column order of product_details.csv.
headers = ['SKU', 'Images', 'Title', 'Vendor', 'Vendor URL', 'Weight', 'Description', 'Rating', 'Shipping & Returns']

# Start a fresh CSV on every run and write the header row; the per-product
# rows are appended to this file later.
with open("product_details.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=headers)
    writer.writeheader()

def _first_tag_text(page, name, **attrs):
    """Return the stripped text of the first matching tag, or "" when absent."""
    tag = page.find(name, **attrs)
    return tag.text.strip() if tag is not None else ""


# Visit every collected product page, download its thumbnails into
# <images_directory>/<SKU>/ and append one row per product to
# product_details.csv.
for counter, link in enumerate(product_links, start=1):
    print(f"Product # {counter}")
    try:
        # Without a timeout a single stalled connection would hang the run.
        response = requests.get(link, timeout=30)
    except requests.RequestException as error:
        print(f"Skipping {link}: {error}")
        continue
    soup = BeautifulSoup(response.content, "html.parser")

    # Fresh dict per product so no value can leak over from the previous one.
    product_details = {}
    product_details["SKU"] = _first_tag_text(soup, "div", id="productSKU").replace("SKU: ", "")

    # NOTE(review): the SKU is scraped text used unsanitised as a directory
    # name; an empty SKU puts the images directly in images_directory.
    sku_directory = os.path.join(images_directory, product_details["SKU"])
    os.makedirs(sku_directory, exist_ok=True)

    product_images = []
    for img in soup.select("#productThumbnails img"):
        src = img.get("src")
        if not src:
            # A thumbnail without a src attribute has nothing to download.
            continue
        product_images.append(src)

        # Drop the query string so the saved file name ends at the extension.
        clean_src = re.sub(r"\?.*$", "", src)
        # Resolve against the product page itself, not a listing-page URL
        # left over from the earlier crawl.
        full_url = urljoin(link, clean_src)
        image_filename = os.path.basename(clean_src)
        if not image_filename:
            continue
        image_path = os.path.join(sku_directory, image_filename)

        # Download and save the image; an HTTP error body must not be saved
        # as if it were image data.
        try:
            image_response = requests.get(full_url, timeout=30)
            image_response.raise_for_status()
        except requests.RequestException as error:
            print(f"Could not download {full_url}: {error}")
            continue
        with open(image_path, "wb") as f:
            f.write(image_response.content)
    product_details["Images"] = product_images

    # Parse other product details
    product_details["Title"] = _first_tag_text(soup, "div", id="productTitle")

    vendor_tag = soup.find("a", id="productVendor")
    if vendor_tag is not None:
        # Both label prefixes are stripped; clean_data also trims the result.
        product_details["Vendor"] = clean_data(
            vendor_tag.text.replace("Vendor: ", "").replace("by: ", "")
        )
        product_details["Vendor URL"] = vendor_tag.get("href", "")
    else:
        product_details["Vendor"] = ""
        product_details["Vendor URL"] = ""

    weight_tag = soup.find("div", id="productWeight")
    if weight_tag is not None:
        product_details["Weight"] = clean_data(weight_tag.text.replace("Weight:  ", ""))
    else:
        product_details["Weight"] = ""

    product_details["Description"] = _first_tag_text(soup, "div", id="productDescription")

    rating_tag = soup.find("div", class_="loox-rating")
    product_details["Rating"] = rating_tag.get("title", "") if rating_tag else ""

    product_details["Shipping & Returns"] = " ".join(
        p.text for p in soup.select(".productTabContent p")
    )

    pprint(product_details)
    # Append (and close) per product so rows already scraped survive a crash.
    # The declared headers fix the column order for every row.
    with open("product_details.csv", "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writerow(product_details)

print("Product details saved to product_details.csv")