5-案例:淘宝商品页爬取
2018-08-10 本文已影响2人
撸撸很乖张
import json
import re
import time

import pymysql
import requests
def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    The request uses a 30 second timeout. The response encoding is replaced
    by the encoding detected from the body (``apparent_encoding``) before the
    text is decoded.

    Returns:
        The decoded page text, or ``""`` when the request fails (connection
        error, timeout, invalid URL, or a non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as "no page"; unlike a bare
        # ``except`` this lets KeyboardInterrupt and real bugs surface.
        return ""
    r.encoding = r.apparent_encoding
    return r.text
def parsePage(ilt, html):
    """Extract ``[price, title]`` pairs from *html* and append them to *ilt*.

    Prices are taken from ``"view_price":"..."`` fields and titles from
    ``"raw_title":"..."`` fields; the n-th price is paired with the n-th
    title. If the page contains different numbers of each, the surplus
    entries are ignored.

    Args:
        ilt: list that receives one ``[price, title]`` entry per product.
            ``price`` is the matched digit/dot string, ``title`` the decoded
            title string.
        html: page text to scan.

    Returns:
        None. *ilt* is modified in place.
    """
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    # Capture the whole quoted literal, allowing backslash escapes (such as
    # \" or \uXXXX) inside it, so the title is not cut at an escaped quote
    # or at a ':' in the text.
    raw_titles = re.findall(r'"raw_title":("(?:[^"\\]|\\.)*")', html)
    for price, raw_title in zip(prices, raw_titles):
        try:
            # Decode the string literal as data; never eval() scraped text.
            title = json.loads(raw_title)
        except ValueError:
            # A malformed title only drops this item, not the rest of the page.
            continue
        ilt.append([price, title])
def deposit(ilt, cursor, db):
    """Insert the scraped items into the ``computer`` table as one batch.

    Args:
        ilt: iterable of items where ``item[0]`` is the price and ``item[1]``
            is the title.
        cursor: DB-API cursor used to run the INSERT.
        db: connection owning *cursor*; it is committed on success and
            rolled back on failure.

    Returns:
        None. Nothing is executed or committed when *ilt* is empty.

    The whole batch is sent with one ``executemany`` call and committed once,
    so a failure rolls back the entire batch instead of leaving an arbitrary
    prefix of it stored. Failures are reported on stdout rather than raised,
    keeping the function best-effort for the caller.
    """
    sql = "INSERT INTO `computer` (`title`, `price`) VALUES (%s, %s)"
    try:
        rows = [(item[1], item[0]) for item in ilt]
        if not rows:
            return
        cursor.executemany(sql, rows)
        db.commit()
    except Exception as exc:
        db.rollback()
        print("deposit failed, batch rolled back:", exc)
def main(goods='台式机', depth=99):
    """Crawl Taobao search result pages and store the products in MySQL.

    Args:
        goods: search keyword appended to the search URL.
        depth: number of result pages to fetch; page ``i`` is requested with
            the offset ``s = 44 * i``.

    For each page the HTML is fetched, parsed into ``[price, title]`` items
    and written to the database, followed by a 3 second pause and a progress
    print of the page index.
    """
    # Keyword arguments are required: PyMySQL 1.0+ rejects positional
    # connect() parameters.
    # NOTE(review): credentials are hard-coded; consider reading them from
    # configuration or the environment.
    db = pymysql.connect(
        host='127.0.0.1',
        user='root',
        password='123456',
        database='spider',
        charset='utf8',
    )
    try:
        cursor = db.cursor()
        start_url = 'https://s.taobao.com/search?q=' + goods
        for i in range(depth):
            info_list = []  # fresh list per page so items are stored once
            try:
                url = start_url + '&s=' + str(44 * i)
                html = getHTMLText(url)
                parsePage(info_list, html)
                deposit(info_list, cursor, db)
            except Exception as exc:
                # Keep crawling the remaining pages, but make the failure
                # visible instead of silently skipping it.
                print("page", i, "failed:", exc)
            # Pause between requests even after a failed page.
            time.sleep(3)
            print(i)
    finally:
        # Release the connection even if the crawl is interrupted.
        db.close()
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()