Python text
2019-11-27 本文已影响0人
laod_wh
最近突然想看电子书,但想看的那本书要么得下载 App,要么只能在线阅读,太麻烦,于是写了个简单的抓取脚本:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# http://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting
import http.cookiejar # import cookielib
import os
import io
import random
import urllib
import uuid
import bs4
import datetime
import requests
import sys
import time
import re
from config_info import *
from bs4 import BeautifulSoup as BS
import pymysql
import lxml.html
import re
import gzip
etree = lxml.html.etree
# Browser User-Agent strings; getSoup picks one at random for its request headers.
agents = [
# Firefox
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
# Chrome
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
# UC Browser
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
# iPhone
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
# iPod
"Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
# iPad
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
"Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
# Android
"Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"
]
# Proxy settings intended to avoid getting blocked by the site.
# NOTE(review): this dict is empty and is not referenced anywhere in the
# visible code -- confirm whether it is still needed.
proxies = {}
# Download an HTML page and parse it into a BeautifulSoup document.
def getSoup(page):
    """Fetch *page* and return its HTML parsed with BeautifulSoup.

    The request carries a randomly chosen User-Agent from ``agents`` plus
    ``Host`` and ``Referer`` headers hard-coded for www.xuehong.cc. After
    the download the function sleeps for 2-5 seconds to throttle the crawl.

    Args:
        page: URL of the page to download.

    Returns:
        A BeautifulSoup object built with the ``html.parser`` backend.

    Raises:
        urllib.error.URLError: if the request fails (``HTTPError`` for
            non-success HTTP status codes).
    """
    # Imported locally: a bare ``import urllib`` does not guarantee that the
    # ``urllib.request`` submodule has been loaded.
    import urllib.request

    headers = {
        # Pick a random User-Agent for every request.
        "User-Agent": random.choice(agents),
        "Host": "www.xuehong.cc",
        "Referer": "https://www.xuehong.cc/book/9133/",
    }
    # Headers have to be attached to the Request *before* it is sent;
    # assigning ``addheaders`` on the response afterwards has no effect.
    request = urllib.request.Request(page, headers=headers)
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode('utf-8', errors='ignore')
    soup = BS(html, 'html.parser')
    # Sleep for a few seconds between requests; randint(2, 5) is inclusive
    # on both ends, so no extra lower-bound check is needed.
    time.sleep(random.randint(2, 5))
    return soup
def main():
    """Crawl consecutive chapter pages and append them to a local text file.

    Iterates over page ids 31445761 up to (not including) 36622777, fetches
    each page with ``getSoup``, extracts the chapter title (``h1`` inside
    ``div.bookname``) and the chapter text (``div#content``), and appends
    both to the output file, each followed by a newline.

    Any exception stops the crawl; it is reported on stdout rather than
    propagated.
    """
    try:
        print('------------start---------------')
        for page_id in range(31445761, 36622777, 1):
            url_ = "https://www.xuehong.cc/book/9133/" + str(page_id) + ".html"
            print("current page:" + " ---> " + url_)
            page_soup = getSoup(url_)  # download and parse the page
            tmp = page_soup.find('div', id='content').text.strip()
            bookname = page_soup.find('div', class_='bookname')
            bookname = bookname.find('h1').text.strip()
            # Open in append mode only once there is something to write, and
            # let ``with`` close the handle even if a write fails, so a
            # failed fetch or parse can no longer leak an open file.
            with open("/Volumes/LaCie/mac/python/book.txt", "a", encoding="utf-8") as txt_file:
                txt_file.write(bookname)
                txt_file.write("\n")
                txt_file.write(tmp)
                txt_file.write("\n")
        print('------------success end---------------')
    except Exception as e:
        # Top-level boundary of the script: report the failure and stop.
        print('------------Exception !!!---------------')
        print(e)


if __name__ == "__main__":
    main()
运行日志:
pydev debugger: process 12236 is connecting
Connected to pydev debugger (build 171.4694.70)
------------start---------------
current page: ---> https://www.xuehong.cc/book/9133/31445761.html
current page: ---> https://www.xuehong.cc/book/9133/31445762.html
current page: ---> https://www.xuehong.cc/book/9133/31445763.html
current page: ---> https://www.xuehong.cc/book/9133/31445764.html
current page: ---> https://www.xuehong.cc/book/9133/31445765.html
current page: ---> https://www.xuehong.cc/book/9133/31445766.html
current page: ---> https://www.xuehong.cc/book/9133/31445767.html
current page: ---> https://www.xuehong.cc/book/9133/31445768.html
current page: ---> https://www.xuehong.cc/book/9133/31445769.html
运行文件:
