Nodejs 异步流程控制及若干细节

2017-02-28 本文已影响281人 jarvan4dev

我有酒，你有故事吗？

今天接了个爬虫任务，主要是从网页上将数据爬下来，规整后导出到Excel。以前工作中的爬虫都是基于HttpClient+jsoup，很早就知道Nodejs有cheerio，HTML和JavaScript天生的一对，拿Nodejs去做网页爬虫很简单，有多简单呢？就这么说吧，和你用jQuery没什么两样。所以选择了Nodejs。

不涉及保密事件，故源码已托管GitHub

开工

mkdir crawler && cd crawler
npm init
MacBook-Pro:crawler$ npm init
This utility will walk you through creating a package.json file.
It only covers the most common items, and tries to guess sensible defaults.

See `npm help json` for definitive documentation on these fields
and exactly what they do.

Use `npm install <pkg> --save` afterwards to install a package and
save it as a dependency in the package.json file.

Press ^C at any time to quit.
name: (crawler) crawler
version: (1.0.0) 
description: Nodejs crawler
entry point: (index.js) 
test command: 
git repository: 
keywords: 
author: jarvan4dev@163.com
license: (ISC) 
About to write to /Users/jarvan4dev/Documents/test/crawler/package.json:

{
  "name": "crawler",
  "version": "1.0.0",
  "description": "Nodejs crawler",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "jarvan4dev@163.com",
  "license": "ISC"
}


Is this ok? (yes) yes

npm install - 安装依赖包

npm i cheerio --save
npm i excel-export --save
npm i request --save

简要介绍下这几个依赖包：

cheerio，类似于Java中的jsoup，cheerio的部分就自己看吧。
excel-export，Excel操作工具
request，网络请求工具

小试牛刀

拿人保寿险深入

直接上代码

/**
 * Created by jarvan4dev on 2017/2/28.
 * Crawler for the PICC Life (人保寿险) branch listing.
 *
 * First attempt: fetches the branch index page, records one row per branch,
 * and issues a nested http.get for each branch's detail page from inside the
 * same loop. The Excel export at the bottom is called without waiting for
 * those nested requests.
 */
const http = require('http');
const cheerio = require('cheerio');
const request = require('request');

// Substrings stripped from a branch name to obtain the province name.
const companyName = '人保寿险';
const subName = '分公司';

// Column definitions (caption + cell type) passed to exportUtils.excelWrite.
const headers = [{
    caption: '省',
    type: 'string'
}, {
    caption: '机构名称',
    type: 'string'
}, {
    caption: '营业场所',
    type: 'string'
},{
    caption: '邮编',
    type: 'string'
}, {
    caption: '电话',
    type: 'string'
}];

const fileName = 'renbao.xlsx';

// Accumulates every output row as [province, name, location, zipCode, tel].
let rows = [];

const exportUtils = require('../utils/exportUtils');

http.get('http://www.picclife.com/aboutUsBranch.jhtml', res => {
    let html = '';
    res.setEncoding('utf-8'); // decode chunks as UTF-8 so Chinese text is not garbled
    res.on('data', data => {
        html += data;
    });
    res.on('end', () => {
        let $ = cheerio.load(html); // parse the downloaded HTML with cheerio
        $('.fgs_nr').each((index, element) => {
            let nextLink = $(element).find('.fgs_mc_qg a').attr('href').trim();
            let subComName = $(element).find('.fgs_mc_qg').attr('title').trim(); // branch company name
            let province = subComName.replace(companyName, '').replace(subName, '').trim();
            let location = $(element).find('.fgs_add_qg').attr('title').trim();
            let zipCode = $(element).find('.fgs_zc_qg').text().trim();
            let tel = $(element).find('.fgs_tel_qg').attr('title').trim();
            rows.push([province, subComName, location, zipCode, tel]);
            // Fetch this branch's detail page; the callback below runs later,
            // after the surrounding loop iteration has already returned.
            http.get(nextLink, res => {
                let nextHtml = '';
                res.setEncoding('utf-8'); // decode chunks as UTF-8 so Chinese text is not garbled
                res.on('data', data => {
                    nextHtml += data;
                });
                res.on('end', () => {
                    let $$ = cheerio.load(nextHtml);
                    $$('.fgs_nr').each((i, ele) => {
                        // These assignments overwrite the enclosing iteration's
                        // let-bindings rather than declaring new variables.
                        subComName = $$(ele).find('.fgs_mc_qg2').attr('title').trim(); // sub-branch name
                        location = $$(ele).find('.fgs_add_qg2').attr('title').trim();
                        zipCode = $$(ele).find('.fgs_zc_qg').text().trim();
                        tel = $$(ele).find('.fgs_tel_qg').attr('title').trim();
                        rows.push([province, subComName, location, zipCode, tel]);
                    });
                });
            });
        });
        // NOTE: this runs as soon as the loop above has finished issuing the
        // nested requests, without waiting for any of their 'end' handlers, so
        // rows is exported before the detail-page rows have been added. This is
        // the flaw the surrounding article goes on to discuss.
        exportUtils.excelWrite(headers, rows, fileName);
    });
});

是不是觉得大功告成？Too young too simple！别忘了http请求是异步的（request也是），这段代码有明显的问题：外层each循环只是同步地为每个分公司执行rows.push([province, subComName, location, zipCode, tel])并发出http.get请求，并不会等待这些请求返回；循环一结束就立刻执行exportUtils.excelWrite(headers, rows, fileName)，而此时内层请求的回调很可能一个都还没执行，导出的Excel里自然就缺少了下一级机构的数据（ps: 在这里吐槽下简书代码不带行号的问题）。

下面看下我的改造，我的想法很简（chun）单（ben）：既然第一个for循环内部可能存在异步的http请求，那就把请求从循环里拿出来，先让第一个for循环完全执行结束，再统一发起请求，注意看links变量。

/**
 * Created by jarvan4dev on 2017/2/28.
 * Crawler for the PICC Life (人保寿险) branch listing.
 *
 * Second attempt: the first loop only collects rows and detail-page links;
 * a second loop then issues one http.get per collected link.
 */
const http = require('http');
const cheerio = require('cheerio');
const request = require('request');

// Substrings stripped from a branch name to obtain the province name.
const companyName = '人保寿险';
const subName = '分公司';

// Column definitions (caption + cell type) passed to exportUtils.excelWrite.
const headers = [{
    caption: '省',
    type: 'string'
}, {
    caption: '机构名称',
    type: 'string'
}, {
    caption: '营业场所',
    type: 'string'
}, {
    caption: '邮编',
    type: 'string'
}, {
    caption: '电话',
    type: 'string'
}];

const fileName = 'renbao.xlsx';

// Output rows as [province, name, location, zipCode, tel].
let rows = [];
// One {province, nextLink} record per branch found on the index page.
let links = [];

const exportUtils = require('../utils/exportUtils');

http.get('http://www.picclife.com/aboutUsBranch.jhtml', res => {
    let html = '';
    res.setEncoding('utf-8'); // decode chunks as UTF-8 so Chinese text is not garbled
    res.on('data', data => {
        html += data;
    });
    res.on('end', () => {
        let $ = cheerio.load(html); // parse the downloaded HTML with cheerio
        $('.fgs_nr').each((index, element) => {
            let nextLink = $(element).find('.fgs_mc_qg a').attr('href').trim();
            let subComName = $(element).find('.fgs_mc_qg').attr('title').trim(); // branch company name
            let province = subComName.replace(companyName, '').replace(subName, '').trim();
            let location = $(element).find('.fgs_add_qg').attr('title').trim();
            let zipCode = $(element).find('.fgs_zc_qg').text().trim();
            let tel = $(element).find('.fgs_tel_qg').attr('title').trim();
            links.push({'province': province, 'nextLink': nextLink});
            rows.push([province, subComName, location, zipCode, tel]);
        });
        // Array.prototype.forEach passes (element, index), so the record is the
        // FIRST callback argument; the URL to request is its nextLink field.
        links.forEach(link => {
            http.get(link.nextLink, res => {
                let html = '';
                res.setEncoding('utf-8'); // decode chunks as UTF-8 so Chinese text is not garbled
                res.on('data', data => {
                    html += data;
                });
                res.on('end', () => {
                    let $ = cheerio.load(html);
                    $('.fgs_nr').each((i, element) => {
                        let subComName = $(element).find('.fgs_mc_qg2').attr('title').trim(); // sub-branch name
                        let location = $(element).find('.fgs_add_qg2').attr('title').trim();
                        let zipCode = $(element).find('.fgs_zc_qg').text().trim();
                        let tel = $(element).find('.fgs_tel_qg').attr('title').trim();
                        rows.push([link.province, subComName, location, zipCode, tel]);
                    });
                });
            });
        });
        // NOTE: intentionally left as in the article. forEach only *starts* the
        // requests, so this export still runs before any detail-page rows have
        // been pushed. Splitting the loops does not solve the ordering problem.
        exportUtils.excelWrite(headers, rows, fileName);
    });
});

其实然并卵... 关于第二种写法，我想了一个解决办法：

借助node的EventEmitter，在第一个forEach结束的时候记录下第二个forEach应该执行的次数，即变量links的长度，记做全局变量 count。在第二层for循环的res.on('end')中每完整执行一次网络请求并正确解析数据后，count--，当count减为0时触发一个事件，在外部监听这个事件，然后对数据进行处理即可。
参考网上解决方案：

/**
 * Recursively walks `path`, calling `handleFile` for every non-directory entry
 * and `callback` once when no entries remain outstanding.
 *
 * Completion is tracked with a counter of pending entries: it starts at one
 * (the root), grows by the number of children each time a directory is listed,
 * and shrinks by one whenever an entry (file, directory, or failed stat/readdir)
 * is finished. `callback` fires when the counter reaches zero.
 *
 * @param {string} path - File or directory to start from.
 * @param {Function} handleFile - Called as handleFile(filePath, dirCount), where
 *     dirCount is the number of directories entered so far.
 * @param {Function} callback - Called with no arguments when the walk is done.
 */
function walk(path, handleFile, callback) {
  let pending = 1;    // entries not yet finished; the root itself counts as one
  let dirsEntered = 0; // how many directories have been opened so far

  // Marks one entry as finished and fires the callback after the last one.
  const finish = () => {
    pending -= 1;
    if (pending === 0) {
      callback();
    }
  };

  // A failed stat still counts as a finished entry.
  const onStatError = () => {
    console.log('stat error');
    finish();
  };

  // Hands a file to the caller, then marks it finished.
  const visitFile = (filePath) => {
    handleFile(filePath, dirsEntered);
    finish();
  };

  // Lists a directory, registers its children, then marks the directory finished.
  const visitDir = (dirPath) => {
    dirsEntered += 1;
    fs.readdir(dirPath, (err, entries) => {
      if (err) {
        console.log('read dir error');
        finish();
        return;
      }
      // Register all children before the directory itself is marked finished,
      // so the counter cannot hit zero while children are still outstanding.
      pending += entries.length;
      for (const entry of entries) {
        visit(dirPath + '/' + entry);
      }
      finish();
    });
  };

  // Stats an entry and dispatches it to the directory or file handler.
  const visit = (target) => {
    fs.stat(target, (err, stats) => {
      if (err) {
        onStatError();
        return;
      }
      if (stats.isDirectory()) {
        visitDir(target);
      } else {
        visitFile(target);
      }
    });
  };

  visit(path);
}

其实方式二就是自己实现异步流程控制，其实有更好的方法 --- async。

祭出杀器 --- async

使用async做异步流程控制，代码会优雅很多。

/**
 * Created by jarvan4dev on 2017/2/28.
 * Crawler for the PICC Life (人保寿险) branch listing.
 *
 * Uses async.each to fetch every branch detail page and exports to Excel only
 * after all of those requests have reported completion. Network failures are
 * reported through the async callback instead of surfacing as unhandled
 * 'error' events.
 */
const http = require('http');
const cheerio = require('cheerio');
const async = require('async');

// Substrings stripped from a branch name to obtain the province name.
const companyName = '人保寿险';
const subName = '分公司';

// Column definitions (caption + cell type) passed to exportUtils.excelWrite.
const headers = [{
    caption: '省',
    type: 'string'
}, {
    caption: '机构名称',
    type: 'string'
}, {
    caption: '营业场所',
    type: 'string'
}, {
    caption: '邮编',
    type: 'string'
}, {
    caption: '电话',
    type: 'string'
}];

const fileName = 'renbao.xlsx';

// Output rows as [province, name, location, zipCode, tel].
const rows = [];
// One {province, nextLink} record per branch found on the index page.
const links = [];

const exportUtils = require('../utils/exportUtils');

http.get('http://www.picclife.com/aboutUsBranch.jhtml', res => {
    let html = '';
    res.setEncoding('utf-8'); // decode chunks as UTF-8 so Chinese text is not garbled
    res.on('data', data => {
        html += data;
    });
    res.on('end', () => {
        const $ = cheerio.load(html); // parse the downloaded HTML with cheerio
        $('.fgs_nr').each((index, element) => {
            const nextLink = $(element).find('.fgs_mc_qg a').attr('href').trim();
            const subComName = $(element).find('.fgs_mc_qg').attr('title').trim(); // branch company name
            const province = subComName.replace(companyName, '').replace(subName, '').trim();
            const location = $(element).find('.fgs_add_qg').attr('title').trim();
            const zipCode = $(element).find('.fgs_zc_qg').text().trim();
            const tel = $(element).find('.fgs_tel_qg').attr('title').trim();
            links.push({'province': province, 'nextLink': nextLink});
            rows.push([province, subComName, location, zipCode, tel]);
        });
        async.each(links, (link, callback) => {
            // Guard so that this link reports completion exactly once, even if
            // both an 'error' and an 'end' event were to be delivered.
            let finished = false;
            const finish = err => {
                if (finished) {
                    return;
                }
                finished = true;
                callback(err);
            };
            http.get(link.nextLink, res => {
                let html = '';
                res.setEncoding('utf-8'); // decode chunks as UTF-8 so Chinese text is not garbled
                res.on('data', data => {
                    html += data;
                });
                res.on('error', finish);
                res.on('end', () => {
                    const $ = cheerio.load(html);
                    $('.fgs_nr').each((i, element) => {
                        const subComName = $(element).find('.fgs_mc_qg2').attr('title').trim(); // sub-branch name
                        const location = $(element).find('.fgs_add_qg2').attr('title').trim();
                        const zipCode = $(element).find('.fgs_zc_qg').text().trim();
                        const tel = $(element).find('.fgs_tel_qg').attr('title').trim();
                        rows.push([link.province, subComName, location, zipCode, tel]);
                    });
                    finish();
                });
            }).on('error', finish); // without this a failed request raises an unhandled 'error' event
        }, err => {
            // Reached once every link has finished, or as soon as one has failed.
            if (err) {
                console.error(err);
                return; // do not export an incomplete sheet
            }
            exportUtils.excelWrite(headers, rows, fileName);
        });
    });
}).on('error', err => {
    // The index page itself could not be fetched; nothing to export.
    console.error(err);
});

或者

/**
 * Created by jarvan4dev on 2017/2/28.
 * Crawler for the PICC Life (人保寿险) branch listing.
 *
 * Uses the request package together with async.eachSeries, so branch detail
 * pages are fetched one after another and rows are appended in page order.
 * Request errors stop the series and are logged instead of being ignored.
 */
const cheerio = require('cheerio');
const request = require('request');
const async = require('async');

// Substrings stripped from a branch name to obtain the province name.
const companyName = '人保寿险';
const subName = '分公司';

// Column definitions (caption + cell type) passed to exportUtils.excelWrite.
const headers = [{
    caption: '省',
    type: 'string'
}, {
    caption: '机构名称',
    type: 'string'
}, {
    caption: '营业场所',
    type: 'string'
}, {
    caption: '邮编',
    type: 'string'
}, {
    caption: '电话',
    type: 'string'
}];

const fileName = 'renbao.xlsx';

// Output rows as [province, name, location, zipCode, tel].
const rows = [];

const exportUtils = require('../utils/exportUtils');

request('http://www.picclife.com/aboutUsBranch.jhtml', (error, response, body) => {
    if (error) {
        // The index page could not be fetched; body is not usable.
        console.error(error);
        return;
    }
    const $ = cheerio.load(body); // parse the downloaded HTML with cheerio
    async.eachSeries($('.fgs_nr'), (element, callback) => {
        const nextLink = $(element).find('.fgs_mc_qg a').attr('href').trim();
        const subComName = $(element).find('.fgs_mc_qg').attr('title').trim(); // branch company name
        const province = subComName.replace(companyName, '').replace(subName, '').trim();
        const location = $(element).find('.fgs_add_qg').attr('title').trim();
        const zipCode = $(element).find('.fgs_zc_qg').text().trim();
        const tel = $(element).find('.fgs_tel_qg').attr('title').trim();
        rows.push([province, subComName, location, zipCode, tel]);
        request(nextLink, (err, res, subBody) => {
            if (err) {
                // Hand the failure to eachSeries; it skips the remaining items.
                callback(err);
                return;
            }
            const $$ = cheerio.load(subBody);
            $$('.fgs_nr').each((i, ele) => {
                const childName = $$(ele).find('.fgs_mc_qg2').attr('title').trim(); // sub-branch name
                const childLocation = $$(ele).find('.fgs_add_qg2').attr('title').trim();
                const childZipCode = $$(ele).find('.fgs_zc_qg').text().trim();
                const childTel = $$(ele).find('.fgs_tel_qg').attr('title').trim();
                rows.push([province, childName, childLocation, childZipCode, childTel]);
            });
            callback();
        });
    }, err => {
        // Reached after the last item has finished, or as soon as one has failed.
        if (err) {
            console.error(err);
            return; // do not export an incomplete sheet
        }
        console.log(rows.length);
        exportUtils.excelWrite(headers, rows, fileName);
    });
});

这两种写法的区别只是方式一用的是原生http模块，方式二用的是request包。另外请注意each和eachSeries的区别：前者会并行发起所有请求，结果写入的先后顺序无法保证；后者是串行的，能够保证顺序。更多关于async请参看官方文档

参考文档：
Nodejs异步流程控制Async

关于excel-export

贴出我的导出文件的工具类吧！

/**
 * Created by jarvan4dev on 2017/2/28.
 */
const excelExport = require('excel-export');
const fs = require('fs');
const path = require('path');

// 导出Excel
exports.excelWrite = (headers, rows, fileName) => {
    let conf ={};
    conf.name = fileName;
    conf.cols = [];
    for(let i = 0; i < headers.length; i++){
        let col = {};
        col.caption = headers[i].caption;
        col.type = headers[i].type;
        conf.cols.push(col);
    }
    conf.rows = rows;
    let result = excelExport.execute(conf);
    let filePath = path.join('/Users/jarvan4dev/Documents', fileName);
 // appendFile 可以当文件不存在的时候自动创建
    fs.appendFile(filePath, result, 'binary',function(err){
        if(err){
            console.log(err);
        }
        console.log('saved')
    });
};

源码放在GitHub上，nodejs-crawler，动动手指，star一下！

Nodejs 异步流程控制及若干细节

开工

npm install - 安装依赖包

小试牛刀

拿富德生命开刀

拿人保寿险深入

祭出杀器 --- async

关于excel-export

猜你喜欢

热点阅读