使用TypeScript编写爬虫工具
2022-10-08 本文已影响0人
jia林
一、安装及配置
1.npm init -y 初始化项目
2.npm i typescript cheerio superagent PS:cheerio 用于解析 HTML 并提取数据;superagent 是轻量的 HTTP(Ajax)请求库
3.npx tsc --init 初始化ts配置文件
4.npm i @types/cheerio @types/superagent ts-node -D PS:@types/cheerio @types/superagent插件的声明文件;ts-node运行ts文件
5.package.json 里配置
"scripts": {
"dev": "ts-node ./src/crowller.ts"
},
二、实现思路
1.定义一个爬取的类Crowller
2.初始化时请求并读取html,进行解析,拿到数据并处理,然后写入项目下的 data/crawlingData.json 文件
3.代码如下
// crowller.ts
import fs from 'fs';
import path from 'path';
import superagent from 'superagent';
import Analysis from './analysis';
/**
 * Contract between the crawler and a page-specific parser.
 *
 * The crawler passes the fetched HTML and the path of its output file to
 * `analyze` and writes the returned string to that file as-is.
 */
export interface Analyzer {
// html: raw page markup; filePath: the crawler's output file path.
analyze: (html: string, filePath: string) => string;
}
/**
 * Crawls a single page: downloads its HTML, hands it to an {@link Analyzer},
 * and writes the analyzer's output to `../data/crawlingData.json` (resolved
 * relative to this file's directory).
 *
 * The crawl starts as soon as an instance is constructed.
 */
class Crowller {
  /** Absolute path of the JSON file that receives the crawl output. */
  private readonly filePath = path.resolve(__dirname, '../data/crawlingData.json');

  /**
   * @param url - Address of the page to fetch.
   * @param analyzer - Strategy that converts the fetched HTML into file content.
   */
  constructor(private readonly url: string, private readonly analyzer: Analyzer) {
    // A constructor cannot await, so the promise is handled here explicitly;
    // otherwise a network or file error would surface as an unhandled rejection.
    void this.init().catch((error: unknown) => {
      console.error(`Failed to crawl ${this.url}:`, error);
      // Keep a non-zero exit status so scripts and CI still see the failure.
      process.exitCode = 1;
    });
  }

  /** Downloads the page and returns its body as text. */
  private async getRawHtml(): Promise<string> {
    const result = await superagent.get(this.url);
    return result.text;
  }

  /** Writes `content` to the output file, creating its directory if needed. */
  private writeFile(content: string): void {
    // writeFileSync does not create missing parent directories.
    fs.mkdirSync(path.dirname(this.filePath), { recursive: true });
    fs.writeFileSync(this.filePath, content);
  }

  /** Runs one full crawl: fetch, analyze, persist. */
  private async init(): Promise<void> {
    const html = await this.getRawHtml();
    const fileContent = this.analyzer.analyze(html, this.filePath);
    this.writeFile(fileContent);
  }
}
// Script entry point: crawl the target page using the shared analyzer instance.
const url = 'http://gongjialin.top/pc/';
const analyzer: Analyzer = Analysis.getInstance();
new Crowller(url, analyzer);
// analysis.ts 处理html数据的文件
import fs from 'fs';
import cheerio from 'cheerio';
import { Analyzer } from './crowller';
/** One scraped entry; currently only its title is captured. */
interface Course {
// Text of the first <h2> found inside a `.product-hover` element.
title: string;
}
/** Result of a single crawl run. */
interface CourseResult {
// Millisecond timestamp taken when the page was parsed.
time: number;
// Entries extracted from the page during this run.
data: Course[];
}
/** Shape of the persisted JSON: crawl timestamp -> entries scraped in that run. */
interface Content {
[propName: number]: Course[];
}
/**
 * Analyzer for the course listing page: extracts course titles from the HTML
 * and merges them, keyed by crawl timestamp, into the JSON history stored at
 * the crawler's output path.
 *
 * Obtain the shared instance through {@link Analysis.getInstance}.
 */
export default class Analysis implements Analyzer {
  private static instance: Analysis;

  /** Returns the shared instance, creating it on first use (singleton). */
  static getInstance(): Analysis {
    if (!Analysis.instance) {
      Analysis.instance = new Analysis();
    }
    return Analysis.instance;
  }

  /**
   * Extracts one {@link Course} per `.product-hover` element, using the text
   * of the element's first `<h2>` as the title.
   *
   * @param html - Raw markup of the crawled page.
   * @returns The extracted courses together with the current timestamp (ms).
   */
  private getCourseInfo(html: string): CourseResult {
    const $ = cheerio.load(html);
    const courseInfos: Course[] = [];
    // `each` rather than `map`: the callback only collects, it produces no value.
    $('.product-hover').each((_index, element) => {
      const title = $(element).find('h2').eq(0).text();
      courseInfos.push({ title });
    });
    return {
      time: Date.now(),
      data: courseInfos,
    };
  }

  /**
   * Merges a crawl result into the history already stored at `filePath`.
   *
   * A missing, empty, or whitespace-only file is treated as an empty history.
   *
   * @param courseInfo - Result of the current crawl.
   * @param filePath - Path of the JSON history file (only read here).
   * @returns The stored history plus an entry keyed by `courseInfo.time`.
   * @throws SyntaxError if the file contains malformed JSON.
   * @throws Error if the file contains valid JSON that is not an object.
   */
  private generateJsonContent(courseInfo: CourseResult, filePath: string): Content {
    let fileContent: Content = {};
    if (fs.existsSync(filePath)) {
      // trim() also strips a leading BOM, which JSON.parse would reject.
      const raw = fs.readFileSync(filePath, 'utf-8').trim();
      if (raw) {
        const parsed: unknown = JSON.parse(raw);
        // Reject null, arrays, and primitives: assigning a timestamp key to
        // them either throws an opaque TypeError or silently drops the data.
        if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
          throw new Error(`Expected a JSON object in ${filePath}`);
        }
        fileContent = parsed as Content;
      }
    }
    fileContent[courseInfo.time] = courseInfo.data;
    return fileContent;
  }

  /**
   * Parses the page and returns the updated history serialized as JSON.
   *
   * @param html - Raw markup of the crawled page.
   * @param filePath - Path of the existing JSON history file to merge with.
   * @returns JSON text of the merged history.
   */
  public analyze(html: string, filePath: string): string {
    const courseInfo = this.getCourseInfo(html);
    const fileContent = this.generateJsonContent(courseInfo, filePath);
    return JSON.stringify(fileContent);
  }
}