nodejs爬虫

2019-10-28 本文已影响0人胡哥a

nodejs相关模块

获取网页内容（http、request、superagent 等）

筛选网页信息（cheerio）

输出或存储信息（console、fs、mongodb、mysql 等）

1、使用 request 模块来获取网页内容

var request = require('request');

// Fetch http://cnodejs.org/ with a GET request and print the page body.
// The callback is invoked with (error, response, body).
request('http://cnodejs.org/', function (error, response, body) {
    // Report transport-level failures instead of dropping them silently.
    if (error) {
        console.error('Request failed:', error);
        return;
    }
    // Strict comparison: only a 200 response is treated as success.
    if (response.statusCode !== 200) {
        console.error('Unexpected status code:', response.statusCode);
        return;
    }
    // Print the page content.
    console.log(body);
});

如果是其他的请求方法，或者需要指定请求头等信息，可以在第一个参数中传入一个对象来指定，比如：

var request = require('request');

// Issue a request described by an options object, which allows choosing the
// HTTP method and sending custom request headers.
request({
    url:    'http://cnodejs.org/',   // URL to request
    method: 'GET',                   // HTTP method
    headers: {                       // custom request headers
        'Accept-Language': 'zh-CN,zh;q=0.8',         // Accept-Language header
        'Cookie': '__utma=4454.11221.455353.21.143;' // Cookie header
    }
}, function (error, response, body) {
    // Report transport-level failures instead of dropping them silently.
    if (error) {
        console.error('Request failed:', error);
        return;
    }
    // Strict comparison: only a 200 response is treated as success.
    if (response.statusCode !== 200) {
        console.error('Unexpected status code:', response.statusCode);
        return;
    }
    // Print the page content.
    console.log(body);
});

2、使用 cheerio 模块来提取网页中的数据

cheerio 实现了 jQuery Core 的一个子集，即 jQuery Core 中与浏览器无关的 DOM 操作 API。以下是一个简单的示例：

var cheerio = require('cheerio');

// Parse an HTML fragment; load() returns a jQuery-style query function.
var markup = '<h2 class="title">Hello world</h2>';
var $ = cheerio.load(markup);

// Manipulate the parsed markup with jQuery-like calls.
var heading = $('h2.title');
heading.text('Hello there!');
$('h2').addClass('welcome');

// Serialize the result back to an HTML string and print it.
var rendered = $.html();
console.log(rendered);
// Expected by the original author: <h2 class="title welcome">Hello there!</h2>
// NOTE(review): the exact output may depend on the cheerio version (newer
// releases may wrap the fragment in <html>/<body>) — confirm.

3、使用 mysql 模块来将数据储存到数据库

mysql 模块内置了连接池机制，以下是一个简单的使用示例：

var mysql = require('mysql');

// Create a database connection pool.
var pool  = mysql.createPool({
  host:            'localhost', // database host
  user:            'root',      // database user
  password:        '',          // password for that user
  database:        'example',   // database name
  connectionLimit: 10           // maximum number of connections (original note: the default is 10)
});

// Borrow a connection from the pool before running SQL queries.
pool.getConnection(function(err, connection) {
  // No connection could be obtained; surface the error.
  if (err) throw err;

  // `connection` is now a usable database connection; run queries here.

  // Hand the connection back to the pool when done so it is not leaked.
  // With asynchronous queries, call release() inside the final query
  // callback instead of here.
  connection.release();
});

参考文档

jquery选择器总结 https://www.cnblogs.com/xiaxuexiaoab/p/7091527.html 
nodejs爬虫 https://www.cnblogs.com/xiaxuexiaoab/p/7124956.html