在子进程中启动爬虫脚本

2018-08-22  本文已影响0人  noyanse

cp.js

const cp = require('child_process')
const { resolve } = require('path')

// Forks the crawler script as a child process, prints whatever `result` it
// sends back over the IPC channel, and reports how the child terminated.
// NOTE(review): the accompanying snippet is titled "crawler.js" while the path
// below points at './crawler1' — confirm the path matches the real file name.
;(() => {
    const script = resolve(__dirname, './crawler1')
    const child = cp.fork(script, [])

    // 'error' and 'exit' may both fire for a single failure; only the first
    // one to arrive is reported.
    let invoked = false

    child.on('error', err => {
        if (invoked) return
        invoked = true
        console.log(err)
    })

    child.on('exit', (code, signal) => {
        if (invoked) return
        invoked = true

        // `code` is null when the child was terminated by a signal, in which
        // case `signal` carries the signal name instead.
        let err = null
        if (code !== 0) {
            err = new Error(signal ? `exit signal ${signal}` : `exit code ${code}`)
        }

        console.log(err)
    })

    // The child posts `{ result }` once scraping has finished.
    child.on('message', data => {
        console.log(data.result)
    })
})()

crawler.js

// Page the crawler opens: the tag listing on movie.douban.com. The query in
// the hash fragment carries sort=R, range=6,10 and an empty `tags` value
// (presumably a rating range of 6–10 — confirm against the site).
const url = `https://movie.douban.com/tag/#/?sort=R&range=6,10&tags=`

const puppeteer = require('puppeteer')


/**
 * Pause helper for async code.
 *
 * @param {number} time - delay in milliseconds, handed straight to setTimeout
 * @returns {Promise<undefined>} promise that fulfils once the timer fires
 */
const sleep = time => new Promise(wake => setTimeout(wake, time))

// Crawler entry point: opens the listing page in headless Chrome, clicks the
// ".more" button once, scrapes every link under ".list-wp" and hands the list
// to the parent process as `{ result }`.
;(async () => {
  console.log('Start visit the target page')

  const browser = await puppeteer.launch({
    args: ['--no-sandbox']
  })

  let result
  try {
    const page = await browser.newPage()
    await page.goto(url, {
      waitUntil: 'networkidle2'
    })

    await sleep(3000)

    await page.waitForSelector('.more')

    // One click on ".more"; raise the bound to load further pages.
    for (let i = 0; i < 1; i++) {
      await sleep(3000)
      await page.click('.more')
    }

    // This callback runs inside the page, so it relies on the page's own
    // jQuery (`window.$`) and cannot see any variable of this script.
    result = await page.evaluate(() => {
      const $ = window.$
      const items = $('.list-wp a')
      const links = []

      items.each((index, item) => {
        const it = $(item)
        const doubanId = it.find('div').data('id')
        const title = it.find('.title').text()
        const rate = Number(it.find('.rate').text())
        // An <img> without a src must not abort the whole scrape.
        const src = it.find('img').attr('src') || ''
        const poster = src.replace('s_ratio', 'l_ratio')

        links.push({
          doubanId,
          title,
          rate,
          poster
        })
      })

      return links
    })
  } finally {
    // Close the browser even when a step above throws, so no Chrome process
    // is left behind.
    await browser.close()
  }

  if (typeof process.send !== 'function') {
    // Started directly rather than forked: there is no IPC channel, so print
    // the result instead of crashing on an undefined process.send.
    console.log(result)
    return
  }

  // Exit only from the send callback; exiting right after process.send() can
  // terminate the process before the message has been delivered.
  process.send({ result }, err => {
    process.exit(err ? 1 : 0)
  })
})().catch(err => {
  // Surface the failure and exit non-zero so the parent's 'exit' handler
  // reports it.
  console.error(err)
  process.exit(1)
})

上一篇 下一篇

猜你喜欢

热点阅读