2019-08-05 《python爬虫开发:从入门到实战》

2019-08-05  本文已影响0人  半睡的猫

1.正则表达式

学习了正则表达式的简单用法。通过这本书发现,在实际的爬虫开发中,用正则表达式去匹配内容并不是很常用。

xpath更好用一点。

2.简单网页爬虫开发

2.1 用python获取网页源码

requests库

2.2 多线程编码的例子


import re

import requests

import os

from multiprocessing.dummy import Pool

def get_toc(html):
    """Extract the chapter URLs from the table-of-contents page.

    Links are taken from the part of the page between the first
    occurrence of ``正文`` and the next ``</tbody>``. Every ``href`` value
    found there is appended to the module-level ``start_url``.

    Args:
        html: Decoded HTML source of the table-of-contents page.

    Returns:
        A list of chapter URLs in page order. The list is empty when the
        page has no ``正文 ... </tbody>`` section or the section holds no
        links.
    """
    # re.S makes '.' match newlines as well, so the block can span lines.
    toc_match = re.search('正文(.*?)</tbody>', html, re.S)
    if toc_match is None:
        # Indexing findall(...)[0] used to raise an opaque IndexError here.
        return []

    hrefs = re.findall('href="(.*?)"', toc_match.group(1), re.S)
    return [start_url + url for url in hrefs]

def get_article(html):
    """Pull the chapter title and body text out of a chapter page.

    Args:
        html: Decoded HTML source of one chapter page.

    Returns:
        A ``(chapter_name, text_block)`` tuple. The name is the text after
        the first ``size="4">`` up to the next ``<``; the text is the
        content of the first ``<p>...</p>`` with every ``<br />`` replaced
        by a single space.
    """
    # group(1) is the text captured by the first parenthesised group.
    title = re.search('size="4">(.*?)<', html, flags=re.S).group(1)
    paragraph = re.search('<p>(.*?)</p>', html, flags=re.S).group(1)

    # Splitting on the tag and re-joining with spaces swaps each tag for ' '.
    return title, ' '.join(paragraph.split('<br />'))

def save_file(chapter, article):
    """Write one chapter to ``动物庄园/<chapter>.txt`` as UTF-8 text.

    The ``动物庄园`` directory is created in the current working directory
    if it does not exist yet. An existing file of the same name is
    overwritten.

    Args:
        chapter: Chapter name, used as the file name without extension.
        article: Chapter text to write.
    """
    book_dir = '动物庄园'
    os.makedirs(book_dir, exist_ok=True)

    target_path = os.path.join(book_dir, chapter + '.txt')
    with open(target_path, mode='w', encoding='utf-8') as out_file:
        out_file.write(article)

def chapter_get_save(chapter_url):
    """Download one chapter page, extract its text and save it to disk.

    Progress is reported on stdout before and after the file is written.

    Args:
        chapter_url: URL of the chapter page to fetch.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block this worker forever.
    response = requests.get(chapter_url, timeout=30)
    # Fail on 4xx/5xx here instead of trying to parse an error page.
    response.raise_for_status()
    html_chapter = response.content.decode('GB2312')

    chapter, article = get_article(html_chapter)
    print("get"+chapter+"ok")

    save_file(chapter, article)
    print("save"+chapter+"ok")

# Main program

# Base URL of the book; get_toc() prepends it to every chapter link.
start_url = 'http://www.kanunu8.com/book3/6879/'

print("begin:")

# Time out instead of hanging on a stalled connection, and stop on an HTTP
# error status rather than parsing an error page as the table of contents.
start_response = requests.get(start_url, timeout=30)
start_response.raise_for_status()
html_str = start_response.content.decode('GB2312')

print("get start html ok")

chapter_url_list = get_toc(html_str)

print("get chapter url ok")

# Pool comes from multiprocessing.dummy, i.e. it is backed by threads.
# map() blocks until every chapter has been handled; leaving the with-block
# then releases the five workers.
with Pool(5) as pool:
    pool.map(chapter_get_save, chapter_url_list)

print("end!")

3.高性能内容解析

XPath 和 BeautifulSoup4

from bs4 import BeautifulSoup
import requests

# Page fetched for the BeautifulSoup example below.
target_url = 'http://exercise.kingname.info/exercise_bs_1.html'

print('begin get html')
# Time out instead of hanging on a stalled connection, and stop on an HTTP
# error status rather than parsing an error page.
response = requests.get(target_url, timeout=30)
response.raise_for_status()
html_str = response.content.decode('utf-8')
print('end get html')
soup = BeautifulSoup(html_str, 'html.parser')

# find() returns None when nothing matches, so check before dereferencing.
info = soup.find(class_='test')
if info is not None:
    print(info.string)

# Print the text of every <li> inside the first element with class "useful".
info2 = soup.find(class_='useful')
all_content = info2.find_all('li') if info2 is not None else []
for li in all_content:
    print(li.string)

4.数据库

mongoDB

redis

上一篇 下一篇

猜你喜欢

热点阅读