node 爬虫 puppeteer 使用笔记

2023-02-06 本文已影响0人 VioletJack

开启掘金成长之旅！这是我参与「掘金日新计划 · 2 月更文挑战」的第 1 天，点击查看活动详情

安装

yarn add puppeteer

// 或者
npm i puppeteer --save

简单使用

关于 puppeteer 的具体使用，我肯定没法和官方文档相比的。下面就用几个业务场景的 demo 来作为参考吧。

登录行为

这里用了简书的登录页面做个尝试。当然它是带有反爬验证的，我们无法绕过这个。但对于一些没有反爬的系统就可以顺利自动登录了。

const puppeteer = require("puppeteer");

const config = {
  username: '123456',
  password: '654321'
}

/**
 * Pauses for a fixed amount of time.
 *
 * @param {number} time - Delay in milliseconds.
 * @returns {Promise<void>} Promise that fulfills once the delay has elapsed.
 */
function waitForTimeout(time) {
  return new Promise((done) => {
    setTimeout(done, time);
  });
}

/**
 * Opens the Jianshu sign-in page, fills the form with the credentials from
 * `config`, submits it, waits three seconds, and closes the browser.
 *
 * @returns {Promise<void>} Fulfills after the browser has been closed.
 */
const run = async () => {
  // Set headless to true if you do not need to watch the browser while it runs.
  const browser = await puppeteer.launch({ headless: false });

  // Close the browser even when a step throws, so no Chromium process is left behind.
  try {
    const page = await browser.newPage();

    // Visit the page
    await page.goto("https://www.jianshu.com/sign_in");

    // Perform the login
    await page.waitForSelector("#sign-in-form-submit-btn");
    await page.evaluate((config) => {
      document.querySelector("#session_email_or_mobile_number").value = config.username;
      document.querySelector("#session_password").value = config.password;
    }, config);
    await page.click("#sign-in-form-submit-btn");

    await waitForTimeout(3000);
  } finally {
    await browser.close();
  }
};

run();

屏幕截图

const puppeteer = require("puppeteer");

/**
 * Loads the Jianshu sign-in page in a headless browser and saves a
 * 1000x500 screenshot of it to `sign_in.png`.
 *
 * @returns {Promise<void>} Fulfills after the browser has been closed.
 */
const run = async () => {
  const browser = await puppeteer.launch({ headless: true });

  // Close the browser even when a step throws, so no Chromium process is left behind.
  try {
    const page = await browser.newPage();

    // Set the viewport before navigating: resizing an already-loaded page can
    // leave it laid out for the previous size.
    await page.setViewport({ width: 1000, height: 500 });

    // Visit the page
    await page.goto("https://www.jianshu.com/sign_in");

    await page.screenshot({ path: 'sign_in.png' });
  } finally {
    await browser.close();
  }
};

run();

爬取接口

虽然接口爬虫可以直接通过 API 访问接口来模拟请求，但作为前端，我更习惯在浏览器中使用 fetch 来获取接口数据。

而 fetch 请求的获取可以通过 chrome 的开发者工具来实现 —— 在 Network 里面找到想爬取的接口请求，右击选择【copy -> copy as fetch】就可以将目标请求的 fetch 方法复制下来了。而由于是模拟真实浏览器登录过程，所以可以直接拿到数据。

const puppeteer = require("puppeteer");

const config = {
  username: "violetjack",
  password: "111111",
};

/**
 * Returns a promise that settles after a delay, for use with `await`.
 *
 * @param {number} time - How long to wait, in milliseconds.
 * @returns {Promise<void>} Fulfills with no value when the timer fires.
 */
function waitForTimeout(time) {
  const sleep = (wake) => {
    setTimeout(wake, time);
  };
  return new Promise(sleep);
}

/**
 * Logs in through the login page with the credentials from `config`, then
 * issues a same-origin fetch from inside the page and prints the parsed JSON
 * response.
 *
 * @returns {Promise<void>} Fulfills after the browser has been closed.
 */
const run = async () => {
  // Set headless to true if you do not need to watch the browser while it runs.
  const browser = await puppeteer.launch({ headless: false });

  // Close the browser even when a step throws, so no Chromium process is left behind.
  try {
    const page = await browser.newPage();

    // Visit the page
    await page.goto(
      "https://login.vj.com"
    );

    // Perform the login
    await waitForTimeout(2000);
    await page.evaluate((config) => {
      document.querySelector("#email").value = config.username;
      document.querySelector("#password").value = config.password;
    }, config);
    await waitForTimeout(1000);
    await page.click("#submit");
    await waitForTimeout(2000);

    // Fetch the API from within the page so the request carries the session
    // of the logged-in browser (credentials: "include").
    const res = await page.evaluate(async () => {
      const response = await fetch(
        "https://login.vj.com/api/base/v1/users/self",
        {
          headers: {
            accept: "application/json, text/plain, */*",
            "accept-language": "zh-CN,zh;q=0.9",
            baggage:
              "sentry-environment=production,sentry-release=fe-gpm%40fddf80da9e689eb52491c638789a598be600a27d,sentry-public_key=d9206c43dc47444e9d921164471da964,sentry-trace_id=eb6f04632d3a4b1a872d79c8f479b27f,sentry-sample_rate=1",
            "sec-ch-ua":
              '"Chromium";v="104", " Not A;Brand";v="99", "Google Chrome";v="104"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "sentry-trace": "eb6f04632d3a4b1a872d79c8f479b27f-be012b103057d83e-1",
          },
          referrer:
            "https://system.vj.com/Portal/index.html",
          referrerPolicy: "strict-origin-when-cross-origin",
          body: null,
          method: "GET",
          mode: "cors",
          credentials: "include",
        }
      );
      return response.json();
    });

    console.log(res);

    await waitForTimeout(3000);
  } finally {
    await browser.close();
  }
};

run();

爬取页面数据

在有些时候，我们没法通过接口来拿到数据。就要去页面上一个个找了。

小技巧：巧用 chrome 开发者工具的元素查看器，可以更快的选中元素哦。

浏览器页面右击，选中【检查】
跳出开发者工具，找到元素（Elements）标签。
找到所需的元素，右击菜单。
在菜单中选中【Copy -> Copy Selector】
通过 DOM 自带的 document.querySelector(selector) 函数获取页面数据。

下面是代码，我这里因为爬取的比较多，所以做了封装。

  const browser = await puppeteer.launch({ headless: true });
  const page = await browser.newPage();

  // 访问页面
  await page.goto(url); // 这里输入目前页面
  await waitForTimeout(2000)
  
  // Runs inside the page: collects the displayed value of each listed form
  // field into an object keyed by the field's label.
  const result = await page.evaluate(() => {
    const mobj = {};

    /**
     * Reads the first element matching `selector` and stores its value in
     * `mobj` under `name`. Nothing is stored when no element matches.
     *
     * @param {string} name - Key to store the value under.
     * @param {string} selector - CSS selector of the element to read.
     */
    const query = (name, selector) => {
      const ele = document.querySelector(selector);
      if (!ele) {
        return;
      }

      if (ele.tagName === "INPUT") {
        mobj[name] = ele.value;
      } else if (ele.tagName === "SELECT") {
        // Read the label of the option that is actually selected. Matching
        // direct children by value misses options nested in <optgroup> and
        // picks the wrong one when several options share a value.
        const selected = ele.selectedOptions[0];
        mobj[name] = selected ? selected.innerText : "";
      } else {
        mobj[name] = ele.innerText;
      }
    };

    query("申请人", "#Control11");
    query("核报金额", "#Control30");
    query("已暂支原币总额", "#Text44");
    query("冲销金额", "#Text5");
    query("业务类型", "#control5 > label");
    query('申请金额', '#div106107 > label');
    query('预支金额', '#div772675 > label');
    query('已暂支金额', '#Text77');
    query("实付金额", "#Control31");
    query("币别", "#div19950 > label");
    query("付款方式", "#Select1");
    query("付款方名称", "#Text71");
    query("收款方名称", "#control26 > label > span");
    query("付款方账号", "#Control35");
    query("收款方账号", "#Div12 > label > span");
    query("付款方开户行", "#Control37");
    query("收款方开户行", "#Div14 > label > span");

    return mobj;
  });
}

一些问题

waitForTimeout 和 waitFor 不见了

最新版本的 puppeteer 已经弃用了 page.waitForTimeout 和 page.waitFor 这两个方法，可以自己简单写一个替代：

/**
 * Stand-in for Puppeteer's former `page.waitForTimeout`.
 *
 * @param {number} timeout - Delay in milliseconds.
 * @returns {Promise<void>} Promise that fulfills once the delay has elapsed.
 */
function waitForTimeout(timeout) {
  return new Promise((finish) => {
    setTimeout(finish, timeout);
  });
}

目前还支持的等待方法有 page.waitForSelector、page.waitForFunction、page.waitForNavigation 等，具体可以查看官方文档。

各种环境问题

在本地开发和服务器部署时，遇到了一些环境问题。在网上找解决方案，最终都指向了一个网址：http://pptr.dev/troubleshooting

这是 puppeteer 官方的问题解决方案，很完整。我就不拿出来搬运了~

参考资料

官方文档必然放在首位 - http://pptr.dev/
我一开始入门就是靠的这篇文章 - https://juejin.cn/post/6844903544919687181
这篇文章没有模拟浏览器，而是直接通过请求爬取知乎数据 - https://juejin.cn/post/6844903510941630471