A Simple Crawler in Python

2017-11-15  大道至简_Andy

Features

  1. Maintain a queue of URLs waiting to be crawled
  2. Deduplicate URLs
  3. Extract only the URLs of interest (a short urljoin sketch follows this list)
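
The trickiest of the three points is the last one: the thread links in the forum's HTML are relative (e.g. thread-6339929-1-1.html), so they must be resolved into absolute URLs before they can be fetched. A minimal sketch of that step, using the same urlparse.urljoin call as the full script below:

import urlparse

# A relative link as it appears in the forum page's HTML
link = 'thread-6339929-1-1.html'
# Resolve it against the URL of the page it was found on
print urlparse.urljoin('http://bbs.mumayi.com/forum-8-1.html', link)
# -> http://bbs.mumayi.com/thread-6339929-1-1.html

The full script: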
#!/usr/bin/env python
# coding=utf-8

import urllib2
import re
import urlparse


def download(url, num_retries=2):
    print 'downloading:', url

    # Pretend to be a desktop browser; some sites reject urllib2's default User-Agent
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    try:
        request = urllib2.Request(url, headers=headers)
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'download error:', e.reason
        html = None
        # Retry only on 5xx server errors; client errors (4xx) will not go away
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex):
    # Queue of URLs still to be crawled
    crawl_queue = [seed_url]
    # Set of URLs already seen, for deduplication
    seen_url = set(crawl_queue)

    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if html is None:
            continue
        for link in get_links(html):
            if re.match(link_regex, link):
                # Resolve the relative link against the page it was found on
                link = urlparse.urljoin(url, link)
                if link not in seen_url:
                    print 'link=', link
                    seen_url.add(link)
                    crawl_queue.append(link)


def get_links(html):
    # Match the href value of every <a> tag in the page
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)


# Example of a link this regex should match: 'thread-6339929-1-1.html'
if __name__ == '__main__':
    link_crawler("http://bbs.mumayi.com/forum-8-1.html", r'thread-[0-9]+-1-1\.html')