麻瓜编程·python实战·1-5作业:爬58转转

2016-08-10  本文已影响0人  bbjoe

我的成果

运行结果

我的代码

from bs4 import  BeautifulSoup
import requests, time, random, json

# Rotate proxies; based on: https://mugglecoding.gitbooks.io/qa/content/ru_he_huo_qu_dai_li_ip.html
# The pool endpoint returns one JSON object per line; malformed lines are skipped.
resp = requests.get("http://tor1024.com/static/proxy_pool.txt")
ips_txt = resp.text.strip().split("\n")
ips = []
for line in ips_txt:
    try:
        ips.append(json.loads(line))
    except json.JSONDecodeError as e:
        # Narrowed from `except Exception`: only a bad JSON line is expected here;
        # anything else should surface instead of being silently printed.
        print(e)

# 58.com search-result pages for second-hand laptops (pages 1-3).
urls = ['http://bj.58.com/pbdn/0/pn{}/'.format(i) for i in range(1, 4)]
# BUG FIX: the header key must be 'User-Agent' (it was 'headers', which servers
# ignore, so the UA spoofing never worked). Also removed a stray `\ ` escape
# that leaked a literal backslash+space into the UA string.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/52.0.2743.116 Safari/537.36'}

# Scrape second-hand item details from a Zhuanzhuan listing page.
def zhuanzhuan_data(url):
    """Fetch one Zhuanzhuan listing and persist its details.

    Downloads *url* through a random proxy from the module-level pool,
    extracts category / title / view count / price / region / tags,
    prints the resulting dict and appends it to the output txt file.
    Any scraping failure is printed and swallowed (best-effort crawl).
    """
    try:
        web_data = requests.get(url, headers=headers,
                                proxies=random.choice(ips), timeout=6)
        soup = BeautifulSoup(web_data.text, 'lxml')
        # BUG FIX: the original selector string contained literal '\>' and '\ '
        # escape sequences, which made it an invalid CSS selector (the error was
        # then silently swallowed by the broad except below).
        category = soup.select('div[class="breadCrumb f12"] > '
                               'span:nth-of-type(4) > a')[0].get_text().lstrip()
        subject = soup.select('h1[class="info_titile"]')[0].get_text()
        page_view = soup.select('span[class="look_time"]')[0].get_text()
        price = soup.select('span[class="price_now"] > i')[0].get_text()
        region = soup.select('div[class="palce_li"] > span > i')[0].get_text()
        # One listing carries many tags; collect their texts in order.
        total_tags = [tag.get_text()
                      for tag in soup.select('div[class="biaoqian_li"] > span')]

        # Aggregate everything into one record.
        data = {
            'category': category,
            'title': subject,
            'view': page_view,
            'price': price,
            'region': region,
            'tags': total_tags,
        }

        # Print the record and hand it to the txt writer.
        print(data)
        create_txt(data)

    except Exception as e:
        # Best-effort scraping: log the failure and move on to the next listing.
        print(e)

def create_txt(data):
    """Append one scraped record (as its ``str`` repr) plus a divider line
    to the output file on the desktop.

    Uses a ``with`` block so the handle is closed even if the write raises
    (the original leaked the handle on error), and an explicit UTF-8
    encoding so the Chinese text is not at the mercy of the locale default.
    """
    with open(r'c:/users/administrator/desktop/zz.txt', 'a', encoding='utf-8') as f:
        f.write(str(data) + '\n' + '-' * 70 + '\n')

# Collect second-hand listing links from one 58.com search-result page.
def get_zz(url):
    """Fetch a 58.com search page, extract every Zhuanzhuan listing link,
    and scrape each listing via :func:`zhuanzhuan_data`."""
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    zz_urls = soup.select('tr.zzinfo > td.img > a')

    # BUG FIX: `zz_urls.index(zz_url)` returns the FIRST equal element, so
    # duplicate links reported the wrong position (bs4 Tags compare by
    # content), and it cost O(n) per item. enumerate gives the true
    # 1-based position in O(1).
    for n, link in enumerate(zz_urls, start=1):
        print('这是第', n, '条转转')
        zhuanzhuan_data(link.get('href'))

# ----- entry point: walk each search page in order -----
# BUG FIX: `urls.index(url)` rescans the list every iteration and would
# misreport the page number if a URL repeated; enumerate is the idiomatic
# 1-based counter.
for page_no, url in enumerate(urls, start=1):
    print('#####第', page_no, '页#####')
    print('-' * 60)
    get_zz(url)
    print('-' * 60)

我的感想:

上一篇 下一篇

猜你喜欢

热点阅读