爬虫基础
HTTP 协议参考(注意:链接文件名为 http1.3.pdf,与 HTTP/1.1 版本号不一致,以文档内容为准): http://files.blogjava.net/sunchaojin/http1.3.pdf
1. 查看网页源码:在 Chrome 页面中按 Ctrl+U 查看源码,或按 F12 打开开发者工具
2.使用pycharm创建网页文件 源码包括:images文件夹、css文件、html文件
3.安装库lxml、BeautifulSoup4、Requests
https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
http://beautifulsoup.readthedocs.io/zh_CN/latest/
http://docs.python-requests.org/zh_CN/latest/user/quickstart.html
请求成功时 response 的 status_code 为 200
from bs4 import BeautifulSoup
import requests
import time
# One URL per listing page (pages 1-8) of the duzhe.com category cid=38.
urls = ['http://www.duzhe.com/index.php?v=listing&cid=38&page={}'.format(page)
        for page in range(1, 9)]
def get_list(url, data=None):
    """Fetch one listing page and print a {'title', 'like'} dict per entry.

    Args:
        url: Listing-page URL to scrape.
        data: Unused; retained for backward compatibility with existing
            callers (the original parameter was immediately shadowed by a
            loop-local assignment and never read).
    """
    # timeout keeps one hung connection from stalling the whole crawl
    wb_data = requests.get(url, timeout=10)
    # be polite: pause between successive page requests
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # CSS selectors target the listing markup; titles/likes are parallel
    # lists, paired positionally below — TODO confirm against live page.
    titles = soup.select('#con_warp > div > div > div.left_p > ul > li > div.con_top > h3 > a')
    likes = soup.select('#con_warp > div > div > div.left_p > ul > li > div.icons_warp > a:nth-of-type(1)')
    for title, like in zip(titles, likes):
        entry = {
            'title': title.get_text(),
            'like': like.get_text(),
        }
        print(entry)
# Crawl every listing page in sequence.
for page_url in urls:
    get_list(page_url)