杂类爬取香江百货 硬存数据库
2019-03-12 本文已影响0人
Meter_bulacn
import requests
import re
import json
from lxml import etree
import urllib.parse
import urllib
import pymysql,random,time
# Module-wide MySQL connection and cursor; the scraping functions below use
# them to insert rows and commit.
# NOTE(review): host and credentials are hard-coded here - consider loading
# them from configuration before sharing or deploying this script.
conn = pymysql.connect(
    host='127.0.0.1',
    user='root',
    password='bc123',
    db='leshop',
    charset='utf8',
)
cur = conn.cursor()

# HTTP headers sent with every request made by this script; the User-Agent
# string identifies the client as desktop Chrome on Linux.
header = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
}
def qingqiu(url, timeout=10):
    """Fetch the index page at *url* and crawl every menu link found on it.

    The response is decoded as UTF-8 and parsed with lxml. For each ``ul``
    matched under the ``#pcUL`` element, the ``href`` of every link inside
    the ``p`` children of the first ``li`` is resolved against the final
    response URL and passed to ``yuedxq``.

    Args:
        url: Address of the index page to start from.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests`` can block indefinitely on an unresponsive server.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # NOTE(review): loop nesting was reconstructed from a copy of the script
    # whose indentation had been lost - confirm against the original.
    response = requests.get(url, headers=header, timeout=timeout)
    response.encoding = 'utf-8'
    page = etree.HTML(response.text)

    for menu in page.xpath('//*[@id="pcUL"]/dl[1]/dd/div/ul'):
        for entry in menu.xpath('.//li[1]/p'):
            for href in entry.xpath('.//a/@href'):
                # hrefs may be relative, so resolve them against the URL that
                # was actually served (after any redirects).
                yuedxq(urllib.parse.urljoin(response.url, href))
def yuedxq(url, timeout=10):
    """Fetch a listing page and store its category name in the database.

    The name is read from the ``title`` attribute matched by
    ``//*[@id="5"]/a[2]``; if several nodes match, the last one is used.
    A single row is inserted into ``goods_goodscategory`` with today's local
    date as ``add_time``, and the transaction is committed immediately.
    Pages without a matching title are skipped instead of failing on an
    undefined name.

    Args:
        url: Address of the listing page to scrape.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests`` can block indefinitely on an unresponsive server.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # NOTE(review): statement nesting was reconstructed from a copy of the
    # script whose indentation had been lost - confirm against the original.
    response = requests.get(url, headers=header, timeout=timeout)
    response.encoding = 'utf-8'
    page = etree.HTML(response.text)

    titles = page.xpath('//*[@id="5"]/a[2]/@title')
    if not titles:
        # Nothing to store for this page; bail out rather than reference a
        # name that was never assigned.
        return
    category_name = titles[-1]

    sql = '''insert into goods_goodscategory(id,name,code,`desc`,category_type,is_tab,add_time,parent_category_id) values(0,%s,1,0,3,1,%s,1)'''
    # strftime formats the current local time when no time tuple is given.
    today = time.strftime('%Y-%m-%d')
    cur.execute(sql, (category_name, today))
    conn.commit()
if __name__ == '__main__':
    # Script entry point: start the crawl from the site's index page.
    qingqiu("http://xjbh.net/index.html")