Commonly used crawler code
2019-01-02
探索1者
Scraping all images of a given Tieba forum with XPath
# Approach
# 1. Get the forum front-page URL, page through it, and find the URL pattern
# 2. Get the URL of every post on one page
# 3. Request each post URL and extract the image URLs inside the post
# 4. Request each image URL in turn and save the body to disk in 'wb' mode
# post link list = parseHtml.xpath('..')
# for one post link in the post link list:
#     html = response from requesting that post
#     for one image link in the image link list:
#         with open('ll.jpg', 'wb') as f:
#             f.write()
# //div[@class="t_con cleafix"]/div/div/div/a/@href
# //div[@class="d_post_content j_d_post_content clearfix"]/img[@class="BDE_Image"]/@src
# kw=%E6%A0%A1%E8%8A%B1&pn=100
from lxml import etree
import requests
import urllib.parse


class BaiduImgSpider:
    def __init__(self):
        self.baseurl = 'http://tieba.baidu.com'
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
        self.mainurl = 'http://tieba.baidu.com/f?'

    # Get the URL list of all posts on one page
    def getPageUrl(self, params):
        # Send the request
        res = requests.get(self.mainurl, params=params, headers=self.headers)
        res.encoding = 'utf-8'
        html = res.text
        # Extract the href of every post on the page
        parseHtml = etree.HTML(html)
        tList = parseHtml.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
        for t in tList:
            tLink = self.baseurl + t
            self.getImgUrl(tLink)

    # Get the URL list of all images in one post
    def getImgUrl(self, tLink):
        # Fetch the response body of one post
        res = requests.get(tLink, headers=self.headers)
        res.encoding = 'utf-8'
        html = res.text
        # Extract the image src (and embedded video data-video) attributes
        parseHtml = etree.HTML(html)
        imgList = parseHtml.xpath('//div[@class="video_src_wrapper"]/embed/@data-video | //div[@class="d_post_content j_d_post_content clearfix"]/img[@class="BDE_Image"]/@src')
        # Walk the image links and call the writer for each one
        for img in imgList:
            self.writeImage(img)

    # Save one image to the local disk
    def writeImage(self, img):
        # Request the image URL and take res.content (binary data)
        res = requests.get(img, headers=self.headers)
        html = res.content
        # Write the binary data to a local file named after the last 12 characters of the URL
        filename = img[-12:]
        with open(filename, 'wb') as f:
            f.write(html)
            print("%s downloaded" % filename)

    # Main routine
    def workOn(self):
        name = input('Tieba forum name to crawl: ')
        begin = int(input("Start page: "))
        end = int(input("End page: "))
        for n in range(begin, end + 1):
            pn = (n - 1) * 50
            params = {
                "kw": name,
                "pn": pn
            }
            self.getPageUrl(params)
            # params = urllib.parse.urlencode(params)
            # # Build the full URL by hand instead of passing params
            # url = self.baseurl + "/f?" + params
            # self.getPageUrl(url)


if __name__ == "__main__":
    spider = BaiduImgSpider()
    spider.workOn()
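The class above is driven interactively by workOn(). For reference, a minimal non-interactive usage sketch that reuses only the methods defined above; the forum name "python" and the two-page range are placeholder assumptions, and pn follows the 50-posts-per-page pattern noted in the plan:

spider = BaiduImgSpider()
for page in range(1, 3):  # pages 1 and 2
    # kw is the forum name, pn the offset of the first post on that page
    params = {"kw": "python", "pn": (page - 1) * 50}
    spider.getPageUrl(params)  # downloads every image found on that page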
Scraping the text posts on Qiushibaike with XPath
import requests
from lxml import etree
import pymongo


class QiushSpider:
    def __init__(self):
        self.url = "https://www.qiushibaike.com/text/"
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
        # Connection object
        self.conn = pymongo.MongoClient("10.8.20.56", 27017)
        # Database object
        self.db = self.conn["Qiushidb"]
        # Collection object
        self.myset = self.db["zhuanye"]

    # Fetch the page
    def getPage(self):
        res = requests.get(self.url, headers=self.headers)
        res.encoding = 'utf-8'
        html = res.text
        self.parsePage(html)

    # Parse the page and write the results to the database
    def parsePage(self, html):
        # Build the parse object, which is also a node object
        parseHtml = etree.HTML(html)
        # Call xpath on it to get one node object per joke
        baseList = parseHtml.xpath('//div[contains(@id,"qiushi_tag_")]')
        # Walk the joke nodes and extract them one by one
        for base in baseList:
            # base: <Element at ...>, a node object
            # User nickname
            username = base.xpath('./div/a/h2')
            if username:
                username = username[0].text
            else:
                username = '匿名用户'  # anonymous user
            # Joke text
            content = base.xpath('./a/div[@class="content"]/span/text()')
            content = "".join(content).strip()
            # Number of "funny" votes
            laugh = base.xpath('.//i[@class="number"]')[0].text
            # Number of comments
            comments = base.xpath('.//i[@class="number"]')[1].text
            # Store in MongoDB; build a dict first
            d = {
                "username": username.strip(),
                "content": content.strip(),
                "laugh": laugh.strip(),
                "comments": comments.strip()
            }
            self.myset.insert_one(d)

    # Main routine
    def workOn(self):
        print('Crawling...')
        self.getPage()
        print('Done; results stored in the Qiushidb database')


if __name__ == '__main__':
    spider = QiushSpider()
    spider.workOn()
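After workOn() finishes, the stored documents can be read back with the same pymongo objects. A small sketch, assuming the same host and the Qiushidb database / zhuanye collection used above:

import pymongo

conn = pymongo.MongoClient("10.8.20.56", 27017)
myset = conn["Qiushidb"]["zhuanye"]
# How many jokes were stored
print(myset.count_documents({}))
# Show a few of them, without the MongoDB _id field
for doc in myset.find({}, {"_id": 0}).limit(3):
    print(doc["username"], doc["laugh"], doc["comments"])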
requests module example: scraping the Douban movie chart
Target: Douban Movies - Charts - drama movie titles and ratings
import requests
import json
import pymysql


class DoubanSpider:
    def __init__(self):
        self.url = "https://movie.douban.com/j/chart/top_list?"
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
        self.db = pymysql.connect(host="10.8.20.56", user="kk", password="123456",
                                  database="spiderdb", charset="utf8")
        self.cursor = self.db.cursor()

    # Fetch the page
    def getPage(self, params):
        res = requests.get(self.url, params=params, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        # print(html)
        # html is a JSON array: [{movie 1}, {movie 2}, ...]
        self.parsePage(html)

    # Parse the page
    def parsePage(self, html):
        ins = "insert into douban(name,score) values(%s,%s)"
        rList = json.loads(html)
        for rDict in rList:
            name = rDict["title"]
            score = rDict["score"]
            L = [name.strip(), float(score.strip())]
            print(L)
            # print(name, score)
            self.cursor.execute(ins, L)
            self.db.commit()
        print("Insert finished")
        self.cursor.close()
        self.db.close()

    # Main routine
    def workOn(self):
        number = input("How many movies to fetch: ")
        params = {
            "type": 11,
            "interval_id": "100:90",
            "action": "",
            "start": "0",
            "limit": number
        }
        self.getPage(params)


if __name__ == '__main__':
    spider = DoubanSpider()
    spider.workOn()
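The INSERT statement above assumes a douban table already exists in spiderdb. A sketch of a schema that would accept those rows; only the column names come from the INSERT statement, the types are assumptions:

import pymysql

db = pymysql.connect(host="10.8.20.56", user="kk", password="123456",
                     database="spiderdb", charset="utf8")
cur = db.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS douban (
        id    INT PRIMARY KEY AUTO_INCREMENT,
        name  VARCHAR(200),
        score DECIMAL(3,1)
    )
""")
db.commit()
cur.close()
db.close()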
Scraping JD product listings with selenium + chromedriver
from selenium import webdriver
import time

# Create the browser object
driver = webdriver.Chrome()
# Open the JD home page
driver.get('https://www.jd.com/')
# Find the search box, read a keyword from the terminal and type it in
text = driver.find_element_by_class_name('text')
key = input("Search keyword: ")
text.send_keys(key)
# Click the search button
button = driver.find_element_by_class_name('button')
button.click()
time.sleep(2)

while True:
    # Run a script that scrolls the page all the way to the bottom
    driver.execute_script(
        'window.scrollTo(0,document.body.scrollHeight)')
    time.sleep(3)
    # Extract and parse the data
    rList = driver.find_elements_by_xpath(
        '//div[@id="J_goodsList"]//li')
    # rList: [product-1 node object, product-2 node object, ...]
    for r in rList:
        contentList = r.text.split('\n')
        price = contentList[0]
        name = contentList[1]
        commit = contentList[2]
        market = contentList[3]
        d = {
            "price": price,
            "name": name,
            "comments": commit,
            "shop": market,
        }
        with open("jd.json", "a", encoding="utf-8") as f:
            f.write(str(d) + '\n')
    # Click "next page"; find() returns -1 when the text is not on the page
    if driver.page_source.find('pn-next disabled') == -1:
        driver.find_element_by_class_name('pn-next').click()
        time.sleep(3)
    else:
        print("Crawl finished")
        break
# "Next page" clickable:     pn-next
# "Next page" not clickable: pn-next disabled

# Close the browser
driver.quit()
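The find_element_by_* / find_elements_by_* helpers used above were current at the time of writing; Selenium 4 removed them in favour of find_element(By, ...). A sketch of the same lookups with the newer locator API, with everything else unchanged:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.jd.com/')
text = driver.find_element(By.CLASS_NAME, 'text')        # was find_element_by_class_name('text')
button = driver.find_element(By.CLASS_NAME, 'button')
rList = driver.find_elements(By.XPATH, '//div[@id="J_goodsList"]//li')
driver.quit()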