Python: Collecting Used Car Listings from 58同城 (58.com)

2019-04-23  盛夏光年ing

This post is a scraper for used car listings on 58同城. It collects each car's price and some basic information and saves the results to a CSV file.

Created: 2019-04-22 10:20. It's quite simple, but sharing it anyway.

import re
import math
import requests
from scrapy import Selector


def start_request():
    """
    开始爬虫
    :return:
    """
    index_url = 'https://quanguo.58.com/ershouche/'
    index_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'https://quanguo.58.com/ershouche/',
    }
    index_response = requests.get(url=index_url, headers=index_headers)
    if index_response.status_code == 200:
        selector_response = Selector(text=index_response.text)
        all_info = selector_response.xpath('//tr')

        all_car_info_total = selector_response.xpath('//p[@id="infocont"]/strong/text()').extract_first()  # total number of listings
        all_page = math.ceil(int(all_car_info_total) / 50)  # 50 listings per page
        print('Total pages: %s' % all_page)
        get_car_info(all_info)  # parse the first page
        for page_ in range(2, all_page + 1):
            print('Fetching page %s' % page_)
            page_url = 'https://quanguo.58.com/ershouche/pn%s/' % str(page_)
            page_response = requests.get(url=page_url, headers=index_headers)
            if page_response.status_code == 200:
                selector_response = Selector(text=page_response.text)
                all_info = selector_response.xpath('//tr')
                get_car_info(all_info)  # parse this page


def get_car_info(all_info):
    """
    解析获得的信息
    :param all_info:
    :return:
    """
    for each_info in all_info[1:]:  # skip the table header row
        car_info = each_info.xpath('td[2]/a//text()').extract()
        car_log = car_info[0] if len(car_info) >= 2 else ''  # car brand, e.g. Hyundai, Volkswagen, Nissan
        car_model = car_info[1] if len(car_info) >= 2 else ''  # car model, e.g. Sonata 2011 2.0L automatic premium
        base_car_info = each_info.xpath('td[2]/p//text()').extract()  # basic information about the car
        buy_year = re.findall(r'.*\t(\w+)\t', base_car_info[0])[0]  # year of purchase
        travelling_kilometers = re.findall(r'(.*)\t', base_car_info[2])[0]  # kilometers driven
        displacement = re.findall(r'(.*)\t', base_car_info[4])[0]  # engine displacement in litres
        car_type = re.findall(r'(.*)\t', base_car_info[6])[0]  # automatic or manual transmission
        car_price = each_info.xpath('td[3]/b/text()').extract_first()  # price of the car
        car_price = car_price + '万元' if car_price else ''
        car_safety = each_info.xpath('td[4]//a/text()').extract_first()  # whether the vehicle licence is verified
        list_info = [car_log, car_model, buy_year, travelling_kilometers, displacement, car_type, car_price, car_safety]
        all_write = ','.join('%s' % each_ for each_ in list_info)
        with open('car_info.csv', 'a+', encoding='utf-8') as f:
            print('Writing row...')
            f.write(all_write + '\n')


if __name__ == '__main__':
    start_request()
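
One possible refinement (my own suggestion, not part of the original post): because the row is joined with bare commas, any field that happens to contain a comma would break the CSV layout. Below is a minimal sketch using Python's built-in csv module, which quotes such fields automatically and writes a header row once; the write_row helper and the English column names are hypothetical.

import csv
import os

# Hypothetical helper (not in the original script): append rows with the standard
# csv module so fields containing commas or quotes are escaped automatically.
CSV_FILE = 'car_info.csv'  # same file name the original script uses
HEADER = ['brand', 'model', 'buy_year', 'kilometers',
          'displacement', 'transmission', 'price', 'licence_verified']


def write_row(row, path=CSV_FILE):
    """Append one row, writing the header first if the file does not exist yet."""
    is_new_file = not os.path.exists(path)
    with open(path, 'a+', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if is_new_file:
            writer.writerow(HEADER)
        writer.writerow(row)

# Inside get_car_info(), the manual ','.join(...) write could then be replaced with:
# write_row(list_info)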

A quick and simple write-up. Feel free to check out my CSDN account: https://blog.csdn.net/weixin_42812527
