python_58同城二手车信息采集
2019-04-23 本文已影响0人
盛夏光年ing
本篇介绍一个针对 58 同城二手车的爬虫,主要爬取车辆的价格和一些基础信息,并将结果保存到 CSV 表格中。
创建时间:2019-04-22 10:20 很简单还是分享一下吧
import csv
import math
import re

import requests
from scrapy import Selector
def start_request():
    """
    Crawl the 58.com nationwide used-car listing and parse every result page.

    Fetches the index page, reads the total listing count from the
    ``//p[@id="infocont"]/strong`` node, derives the page count from it
    (the code divides by 50, i.e. it assumes 50 listings per page), parses
    the first page and then requests ``.../pn<N>/`` for each remaining page.
    Every page's ``<tr>`` rows are handed to ``get_car_info``.

    Nothing is done when the index request does not return HTTP 200.
    Follow-up pages that do not return HTTP 200 are skipped.

    :return: None
    """
    index_url = 'https://quanguo.58.com/ershouche/'
    index_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'https://quanguo.58.com/ershouche/',
    }
    index_response = requests.get(url=index_url, headers=index_headers)
    if index_response.status_code != 200:
        return

    selector_response = Selector(text=index_response.text)
    all_info = selector_response.xpath('//tr')
    all_car_info_total = selector_response.xpath('//p[@id="infocont"]/strong/text()').extract_first()
    try:
        all_page = math.ceil(int(all_car_info_total) / 50)
    except (TypeError, ValueError):
        # Count node missing or not numeric: still parse the page we already
        # have instead of crashing before any data is saved.
        all_page = 1
    print('has %s total_page' % all_page)
    get_car_info(all_info)  # parse the first page

    # range() excludes its stop value, so "+ 1" is needed to reach the last page.
    for page_ in range(2, all_page + 1):
        print('开始下载第 %s 页图片' % page_)
        page_url = 'https://quanguo.58.com/ershouche/pn%s/' % str(page_)
        page_response = requests.get(url=page_url, headers=index_headers)
        # Check the response of *this* page, not the index response.
        if page_response.status_code != 200:
            continue
        page_selector = Selector(text=page_response.text)
        get_car_info(page_selector.xpath('//tr'))  # parse the next page
def _first_match(pattern, texts, index):
    """Return the first regex match of *pattern* in ``texts[index]``, or '' if absent."""
    if index >= len(texts):
        return ''
    found = re.findall(pattern, texts[index])
    return found[0] if found else ''


def _parse_car_row(each_info):
    """Extract the eight CSV fields from one listing row selector."""
    car_info = each_info.xpath('td[2]/a//text()').extract()
    has_name = len(car_info) >= 2
    car_log = car_info[0] if has_name else ''  # brand, e.g. Hyundai, Volkswagen, Nissan
    car_model = car_info[1] if has_name else ''  # model description text

    # Text nodes of the basic-info paragraph; the wanted values sit at the
    # even indexes 0/2/4/6 (the odd ones are presumably separators).
    base_car_info = each_info.xpath('td[2]/p//text()').extract()
    buy_year = _first_match(r'.*\t(\w+)\t', base_car_info, 0)  # purchase year
    travelling_kilometers = _first_match(r'(.*)\t', base_car_info, 2)  # mileage
    displacement = _first_match(r'(.*)\t', base_car_info, 4)  # engine displacement
    car_type = _first_match(r'(.*)\t', base_car_info, 6)  # automatic / manual

    car_price = each_info.xpath('td[3]/b/text()').extract_first()  # price
    car_price = car_price + '万元' if car_price else ''
    # Licence-verification label; a missing value becomes '' rather than 'None'.
    car_safety = each_info.xpath('td[4]//a/text()').extract_first() or ''

    return [car_log, car_model, buy_year, travelling_kilometers,
            displacement, car_type, car_price, car_safety]


def get_car_info(all_info):
    """
    Parse listing rows and append them to ``car_info.csv``.

    The first element of ``all_info`` is skipped (treated as a header row).
    Each remaining row yields eight columns: brand, model, purchase year,
    mileage, displacement, gearbox type, price and licence-verification
    label. Fields missing from a row are written as empty strings, so one
    malformed row no longer aborts the whole page.

    Rows are written with the ``csv`` module, so values that contain commas
    or quotes are quoted correctly. The file is opened once per call and is
    not touched when there are no data rows.

    :param all_info: sequence of row selectors (the ``//tr`` nodes of a page)
    :return: None
    """
    rows = [_parse_car_row(each_info) for each_info in all_info[1:]]
    if not rows:
        return
    # newline='' lets the csv writer control line endings itself.
    with open('car_info.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        for row in rows:
            print('正在写入中................')
            writer.writerow(row)
# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported as a module.
if "__main__" == __name__:
    start_request()
很简单的一次整理,欢迎查看个人 CSDN 账号:https://blog.csdn.net/weixin_42812527