node+phantomjs+cheerio 实现爬虫(爬取百度图片)
2018-07-02 本文已影响0人
lovelydong
1.安装 phantomjs
下载
http://phantomjs.org/download.html
解压配环境变量
npm i phantomjs phantom request(pa.js 中 require 的是 phantom 和 request 这两个包,需要一并安装)
2.安装 cheerio
使用管理员身份运行cmd
npm install cheerio(在 pa.js 所在目录本地安装;用 -g 全局安装的包默认无法被 require 找到)
3.node 代码
pa.js
const phantom = require('phantom')
const cheerio = require('cheerio')
const request = require('request')
const fs = require('fs')
/**
 * Pause for a number of seconds.
 *
 * @param {number} second - How long to wait, in seconds (converted to
 *   milliseconds for the timer).
 * @returns {Promise<undefined>} A promise that fulfils, with no value, once
 *   the timer fires.
 */
function delay(second) {
  const milliseconds = second * 1000;
  return new Promise(function (done) {
    setTimeout(done, milliseconds);
  });
}
// Baidu image-search results page that the crawler opens first.
// NOTE(review): the query string carries ie=gb18030, so the `word` value is
// presumably a GB18030 percent-encoded search term — confirm before editing it.
let url = 'http://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%CF%C4%C4%BF%D3%D1%C8%CB%D5%CA%B1%DA%D6%BD1080&fr=ala&ala=1&alatpl=adress&pos=0&hs=2&xthttps=000000'
/**
 * Download one image into ./image under a timestamp-based file name.
 *
 * The file extension is taken from the last path segment of the URL only, so
 * a query string or fragment never ends up in the file name. When the path
 * has no extension, 'jpg' is used as a fallback.
 *
 * The download is fire-and-forget: the function returns immediately and
 * stream failures are logged instead of crashing the process.
 *
 * @param {string} url - Address of the image to download. Anything that is
 *   not a non-empty string is skipped with a warning.
 * @returns {undefined}
 */
function save(url) {
  if (typeof url !== 'string' || url === '') {
    console.warn('save: skipped, no image URL was given');
    return;
  }

  // Drop "?query" and "#fragment" first, then look only at the file name.
  const pathOnly = url.split(/[?#]/)[0];
  const fileName = pathOnly.split('/').pop();
  const match = /\.([A-Za-z0-9]+)$/.exec(fileName);
  const ext = match ? match[1] : 'jpg';

  const target = `./image/${Date.now()}.${ext}`;
  const logFailure = (err) => {
    console.error(`save: could not store ${url} as ${target}: ${err && err.message}`);
  };

  // Both streams need their own 'error' listener: an unhandled 'error' event
  // (network failure, missing ./image directory) would terminate the process.
  request(url)
    .on('error', logFailure)
    .pipe(fs.createWriteStream(target).on('error', logFailure));
}
/**
 * Crawl entry point, run immediately when the script is loaded.
 *
 * Steps:
 *   1. Start PhantomJS and open the search results page at `url`.
 *   2. Scroll down 1000px per second, re-parsing the page with cheerio each
 *      time, until at least TARGET_THUMBNAILS `.imgbox` elements exist or
 *      MAX_SCROLL_STEPS is reached.
 *   3. Collect the first link inside every `.imgbox` as a detail-page URL.
 *   4. Open each detail page in turn and hand the `src` of `#currentImg`
 *      to `save`.
 *
 * The PhantomJS instance is always shut down, even when a step throws, and
 * any failure is reported on stderr with a non-zero exit code.
 */
(async function () {
  // Stop scrolling once this many thumbnails are present in the DOM.
  const TARGET_THUMBNAILS = 200;
  // Hard limit on scroll steps so a short result list cannot loop forever.
  const MAX_SCROLL_STEPS = 100;

  const instance = await phantom.create();
  try {
    const page = await instance.createPage();

    // Set before loading so the first layout already uses this viewport.
    await page.property('viewportSize', {
      width: 1920,
      height: 1080,
    });

    const status = await page.open(url);
    if (status !== 'success') {
      throw new Error(`Could not open ${url} (status: ${status})`);
    }

    // Scroll step by step; each pass waits one second for content to load,
    // then re-parses the current DOM.
    let $;
    for (let step = 0; step < MAX_SCROLL_STEPS; step += 1) {
      await delay(1);
      await page.property('scrollPosition', {
        left: 0,
        top: 1000 * step,
      });
      const content = await page.property('content');
      $ = cheerio.load(content);
      if ($('.imgbox').length >= TARGET_THUMBNAILS) {
        break;
      }
    }

    // Thumbnails without a link are skipped rather than turned into
    // "https://image.baidu.comundefined".
    const urlList = [];
    $('.imgbox').each(function () {
      const href = $(this).find('a').attr('href');
      if (href) {
        urlList.push('https://image.baidu.com' + href);
      }
    });

    // Visit exactly the collected pages: no extra iteration past the end of
    // the list, and nothing at all when the list is empty.
    for (const detailUrl of urlList) {
      const detailStatus = await page.open(detailUrl);
      if (detailStatus !== 'success') {
        console.warn(`Skipped ${detailUrl} (status: ${detailStatus})`);
        continue;
      }
      await delay(1);
      const detail = cheerio.load(await page.property('content'));
      const src = detail('#currentImg').attr('src');
      if (src) {
        save(src);
      } else {
        console.warn(`No #currentImg found on ${detailUrl}`);
      }
    }
  } finally {
    // Always stop the PhantomJS child process, including on errors.
    await instance.exit();
  }
}()).catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
4.在执行目录下创建 image 文件夹(存放爬取的图片)
5.在此处执行cmd
node pa.js
然后等着图片一个个出来吧 滑稽