python抓取亚马逊商品信息

2023-06-26 本文已影响0人追梦人在路上不断追寻

python抓取网页数据，主要是以下几个步骤。

安装所需的 Python 包 python -m pip install requests beautifulsoup4
设置请求头部，防止被拦截。
分析商品页面信息，得到数据的构造结构。
通过选择器获取到需要的数据。
进行抓取调试。
抓取数据，进行保存。

下面是主要的抓取代码，安装好想要的扩展包之后，可以直接运行，进行调试。

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

custom_headers = {
    "accept-language": "en-GB,en;q=0.9",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
}

def get_product_info(url):
    response = requests.get(url, headers=custom_headers)
    if response.status_code != 200:
        print("Error in getting webpage")
        exit(-1)

    soup = BeautifulSoup(response.text, "lxml")

    title_element = soup.select_one("#productTitle")
    title = title_element.text.strip() if title_element else None

    price_element = soup.select_one("#price_inside_buybox")
    price = price_element.text if price_element else None

    rating_element = soup.select_one("#acrPopover")
    rating_text = rating_element.attrs.get("title") if rating_element else None
    rating = rating_text.replace("out of 5 stars", "") if rating_text else None

    image_element = soup.select_one("#landingImage")
    image = image_element.attrs.get("src") if image_element else None

    description_element = soup.select_one("#productDescription")
    description = description_element.text.strip() if description_element else None

    return {
        "title": title,
        "price": price,
        "rating": rating,
        "image": image,
        "description": description,
        "url": url,
    }

def parse_listing(listing_url):

    response = requests.get(listing_url, headers=custom_headers)
    soup_search = BeautifulSoup(response.text, "lxml")
    link_elements = soup_search.select("[data-asin] h2 a")
    page_data = []
    for link in link_elements:
        full_url = urljoin(listing_url, link.attrs.get("href"))
        print(f"Scraping product from {full_url[:100]}", flush=True)
        product_info = get_product_info(full_url)
        page_data.append(product_info)

    next_page_el = soup_search.select_one('a:contains("Next")')
    if next_page_el:
        next_page_url = next_page_el.attrs.get('href')
        next_page_url = urljoin(listing_url, next_page_url)
        print(f'Scraping next page: {next_page_url}', flush=True)
        page_data += parse_listing(next_page_url)

    return page_data

def main():
    data = []
    search_url = "https://www.amazon.com/s?k=bose&rh=n%3A12097479011&ref=nb_sb_noss"
    data = parse_listing(search_url)
    df = pd.DataFrame(data)
    df.to_csv('amz.csv')

if __name__ == '__main__':
    main()

python抓取亚马逊商品信息

猜你喜欢

热点阅读