Getting Started with Python Web Scraping

2017-02-28  Yuu_CX

Fetching an image and saving it to a folder

import urllib.request

# Request the image and read the raw bytes
response = urllib.request.urlopen('http://placekitten.com/1920/1280')
cat_img = response.read()

# Write the bytes to a local file in binary mode
with open('cat_1920_1280.jpg', 'wb') as f:
    f.write(cat_img)
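
The same download in a slightly hardened variant — a minimal sketch (the User-Agent string and the 10-second timeout are arbitrary choices) using the browser-header trick that the later examples rely on:

import urllib.request

# Some servers reject urllib's default User-Agent, and a timeout
# avoids hanging forever on a dead connection.
req = urllib.request.Request('http://placekitten.com/1920/1280',
                             headers={'User-Agent': 'Mozilla/5.0'})
with urllib.request.urlopen(req, timeout=10) as response:
    cat_img = response.read()
with open('cat_1920_1280.jpg', 'wb') as f:
    f.write(cat_img)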

Using Youdao Translate

# -*- coding:utf-8 -*- 
import urllib.request
import urllib.parse
import json

content = input("Enter the text to translate: ")
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=dict2.index'

# Spoof a browser User-Agent so the request is not rejected as a script
head = {}
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'

# Form fields expected by the translate endpoint
data = {}
data['type']='AUTO'
data['i']=content
data['doctype']='json'
data['xmlVersion']='1.8'
data['keyfrom']='fanyi.web'
data['ue']='UTF-8'
data['action']='FY_BY_CLICKBUTTON'
data['typoResult']='true'
data = urllib.parse.urlencode(data).encode('utf-8')  # URL-encode the form and convert to bytes for POST

req = urllib.request.Request(url,data,head)
response = urllib.request.urlopen(req)
html = response.read().decode('utf-8')

target = json.loads(html)
print("翻译结果:%s"%(target['translateResult'][0][0]['tgt']))

A Python scraper that downloads all images from jandan.net

# -*- coding:utf-8 -*- 
import urllib.request
import os

def url_open(url):
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
    response = urllib.request.urlopen(req)  # pass the Request object so the header is actually sent
    html = response.read()
    return html
    
def get_page(url):
    html = url_open(url).decode('utf-8')
    a = html.find('current-comment-page') + 23  # skip past the marker to the page number
    b = html.find(']', a)  # find the closing ']' starting from position a
    return html[a:b]  # the current page number as a string
    
def find_imgs(url):
    html = url_open(url).decode('utf-8')
    img_address = []
    a = html.find('img src=')
    while a != -1:
        b = html.find('.jpg', a, a+255)  # look for '.jpg' in the window a..a+255
        if b != -1:
            img_address.append('http:' + html[a+9:b+4])
        else:
            b = a + 9
        a = html.find('img src=', b)
    return img_address

def save_imgs(folder, img_address):
    for each in img_address:
        filename = each.split('/')[-1]  # the last path segment is the image name
        with open(filename, 'wb') as f:
            img = url_open(each)
            f.write(img)
            
def download_mm(folder='ooxx', pages=10):
    os.makedirs(folder, exist_ok=True)  # don't crash if the folder already exists
    os.chdir(folder)

    url = "http://jandan.net/ooxx"
    page_num = int(get_page(url))

    for i in range(pages):
        # walk backwards one page at a time from the newest page
        page_url = url + '/page-' + str(page_num - i) + '#comments'
        img_address = find_imgs(page_url)
        save_imgs(folder, img_address)
        
if __name__ == '__main__':
    download_mm()
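
Both parameters have defaults, so a bare download_mm() fetches the ten most recent pages into ./ooxx. A usage example with hypothetical values:

download_mm(folder='kittens', pages=5)  # five most recent pages into ./kittens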

A Python scraper that downloads all images from a Baidu Tieba thread

# -*- coding:utf-8 -*-
import urllib.request
import re

def url_open(url):
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
    response = urllib.request.urlopen(req)  # pass the Request object so the header is actually sent
    html = response.read()
    return html

def get_img(html):
    html = html.decode('utf-8')  # decode the bytes before regex matching
    p = r'<img class="BDE_Image" src="([^"]+\.jpg)"'
    imglist = re.findall(p, html)
    for each in imglist:
        filename = each.split("/")[-1]  # the last path segment is the image name
        urllib.request.urlretrieve(each, filename, None)

if __name__ == '__main__':
    url = 'http://tieba.baidu.com/p/3563409202'
    get_img(url_open(url))
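
The pattern above only captures .jpg files. A sketch of a broadened pattern, assuming the thread also embeds png or gif images under the same BDE_Image class:

p = r'<img class="BDE_Image" src="([^"]+\.(?:jpg|png|gif))"'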

Scraping the Douban Movie Top 250 (for reference)

import pymysql
import requests
from bs4 import BeautifulSoup


# %d is a numeric placeholder for the paging offset
baseUrl = "https://movie.douban.com/top250?start=%d&filter="
def get_movies(start):
    url = baseUrl % start
    lists = []
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")  # parse the page content
    items = soup.find("ol", "grid_view").find_all("li")  # all movie entries on the page
    for i in items:
        movie = {}  # temporary dict for one movie
        movie["rank"] = i.find("em").text  # ranking position
        movie["link"] = i.find("div", "pic").find("a").get("href")  # detail-page link
        movie["poster"] = i.find("div", "pic").find("a").find('img').get("src")  # poster URL
        movie["name"] = i.find("span", "title").text  # title
        movie["score"] = i.find("span", "rating_num").text  # rating
        movie["other"] = i.find("span", "other").text.replace('/', '').replace('    ', '/')  # alternate titles
        movie["quote"] = i.find("span", "inq").text if i.find("span", "inq") else ""  # some movies have no blurb; use an empty string
        movie["comment_num"] = i.find("div", "star").find_all('span')[3].text  # number of ratings
        movie["detail"] = i.find("div", "bd").find("p", "").text  # raw details block
        lists.append(movie)  # collect into the result list
    return lists

if __name__ == "__main__":
     # 连接数据库,需指定charset否则可能会报错
    db = pymysql.connect(host="localhost",user="root",password="root",db="new_schema",charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS movies")# 如果表存在则删除
    # 创建表sql语句
    createTab = """CREATE TABLE movies(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(20) NOT NULL,
        rank VARCHAR(4) NOT NULL,
        link VARCHAR(50) NOT NULL,
        poster VARCHAR(100) NOT NULL,
        score VARCHAR(4) NOT NULL,
        other VARCHAR(100) NOT NULL,
        quote VARCHAR(50),
        detail VARCHAR(300) NOT NULL,
        comment_num VARCHAR(100) NOT NULL
    )"""
    cursor.execute(createTab)
    for start in range(0,250,25):
        lists = get_movies(start)# 获取提取到数据
        for i in lists:
             # 插入数据到数据库sql语句,%s用作字符串占位
            sql = "INSERT INTO `movies`(`name`,`rank`,`link`,`poster`,`score`,`other`,`quote`,`detail`,`comment_num`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (i["name"], i["rank"], i["link"], i["poster"], i["score"], i["other"], i["quote"], i["detail"], i["comment_num"]))
                db.commit()
                print(i["name"]+" is success")
            except:
                db.rollback()
    db.close()
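
As a quick sanity check, a sketch that counts the inserted rows (run it just before db.close()):

cursor.execute("SELECT COUNT(*) FROM movies")
print("rows inserted:", cursor.fetchone()[0])  # should print 250 on a full run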

Splitting the scraped Douban movie details by year, country/region, genre, etc., and writing them to MySQL

import pymysql
import requests
from bs4 import BeautifulSoup
import re

# %d is a numeric placeholder for the paging offset
baseUrl = "https://movie.douban.com/top250?start=%d&filter="
def get_movies(start):
    url = baseUrl % start
    lists = []
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")  # parse the page content
    items = soup.find("ol", "grid_view").find_all("li")  # all movie entries on the page
    for i in items:
        movie = {}  # temporary dict for one movie
        movie["rank"] = i.find("em").text  # ranking position
        movie["link"] = i.find("div", "pic").find("a").get("href")  # detail-page link
        movie["poster"] = i.find("div", "pic").find("a").find('img').get("src")  # poster URL
        movie["name"] = i.find("span", "title").text  # title
        movie["score"] = i.find("span", "rating_num").text  # rating
        movie["other"] = i.find("span", "other").text.replace('/', '').replace('    ', '/')  # alternate titles
        movie["quote"] = i.find("span", "inq").text if i.find("span", "inq") else ""  # some movies have no blurb; use an empty string
        movie["comment_num"] = i.find("div", "star").find_all('span')[3].text  # number of ratings
        movie["detail"] = i.find("div", "bd").find("p", "").text  # raw details block
        lists.append(movie)  # collect into the result list
    return lists


if __name__ == "__main__":
     # 连接数据库,需指定charset否则可能会报错
    db = pymysql.connect(host="localhost",user="root",password="root",db="new_schema",charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS movies")# 如果表存在则删除
    # 创建表sql语句
    createTab = """CREATE TABLE movies(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(20) NOT NULL,
        rank VARCHAR(4) NOT NULL,
        link VARCHAR(50) NOT NULL,
        poster VARCHAR(100) NOT NULL,
        score VARCHAR(4) NOT NULL,
        other VARCHAR(100) NOT NULL,
        quote VARCHAR(50),
        detail VARCHAR(300) NOT NULL,
        time VARCHAR(300) NOT NULL,
        country VARCHAR(300) NOT NULL,
        type VARCHAR(300) NOT NULL,
        drictor_artist VARCHAR(300) NOT NULL,
        comment_num VARCHAR(100) NOT NULL
    )"""
    cursor.execute(createTab)
    for start in range(0,250,25):
        lists = get_movies(start)# 获取提取到数据
        data=[]
        for i in lists:
             action = i["detail"]
             remove=re.compile(r'                            |\n|</br>|\.*')
             bd=re.sub(remove,"",action)
             bd=re.sub('<br>',"   ",bd)#去掉<br>
             bd=re.sub('/',"   ",bd)#替换/
             words=bd.split("   ")
             for s in words:
                  if len(s)!=0 and s!=' ':#去掉空白内容
                        data.append(s)
             i["time"] = data[-3][-5:]
             i["country"] = data[-2]
             i["type"] = data[-1]
             i["drictor_artist"] = data[0]
             # 插入数据到数据库sql语句,%s用作字符串占位
             sql = "INSERT INTO `movies`(`name`,`rank`,`link`,`poster`,`score`,`other`,`quote`,`detail`,`time`,`country`,`type`,`drictor_artist`,`comment_num`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
             try:
                 cursor.execute(sql, (i["name"], i["rank"], i["link"], i["poster"], i["score"], i["other"], i["quote"], i["detail"], i["time"], i["country"], i["type"], i["drictor_artist"], i["comment_num"]))
                 db.commit()
                 print(i["name"]+" is success")
             except:
                 db.rollback()
    db.close()

Finally, you can plot the year distribution of the Top 250 movies.
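
A minimal plotting sketch with matplotlib, assuming the movies table created above and the same local MySQL credentials; it pulls the time column and draws a histogram in 5-year buckets:

import pymysql
import matplotlib.pyplot as plt

# Assumes the `movies` table from the previous script and the same credentials.
db = pymysql.connect(host="localhost", user="root", password="root",
                     db="new_schema", charset="utf8mb4")
cursor = db.cursor()
cursor.execute("SELECT time FROM movies")
years = [int(row[0].strip()) for row in cursor.fetchall()
         if row[0].strip().isdigit()]  # keep only rows that parse cleanly as a year
db.close()

plt.hist(years, bins=range(min(years), max(years) + 5, 5))  # 5-year buckets
plt.xlabel("Year")
plt.ylabel("Number of movies")
plt.title("Douban Top 250 by year")
plt.show()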

[Figure: year distribution of the Douban Top 250 movies]
