Integrating Elasticsearch with Spring Boot (1)
2020-12-17
SheHuan
In the previous articles we covered the commonly used APIs of the Java High Level REST Client; next we'll look at how to integrate Elasticsearch into a Spring Boot project. For adding the ES-related dependencies to a Spring Boot project, refer to the earlier article "Elasticsearch 使用 Java High Level REST Client 操作索引、文档" (operating on indices and documents with the Java High Level REST Client).
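As a quick refresher on that setup, registering a RestHighLevelClient bean usually looks roughly like the sketch below. This is only an illustration, not the exact configuration from that article; the host and port are placeholders for your own ES node.
import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

@Configuration
public class EsClientConfig {
    // Minimal client bean; replace the host/port with your own Elasticsearch node
    @Bean(destroyMethod = "close")
    public RestHighLevelClient restHighLevelClient() {
        return new RestHighLevelClient(
                RestClient.builder(new HttpHost("127.0.0.1", 9200, "http")));
    }
}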
This time the goal is a more realistic example, rather than making up a few records by hand as before. So we first need to crawl some data: we'll scrape book listings from JD.com by keyword search. Searching for Java, for example, returns a page like the one below; we extract the fields we want from it, then turn the page and repeat:

Doing the crawling in Java naturally calls for jsoup; add the following dependency to the existing project:
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
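Under the hood the crawler in the next section boils down to two jsoup calls: Jsoup.connect(...).get() to fetch a page, and Document.select(...) with a CSS selector to pick out elements. Here is a stripped-down sketch of that flow (illustrative only; without the cookie and user-agent headers set in the full crawler below, JD may not return real results):
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

public class JsoupQuickCheck {
    public static void main(String[] args) throws IOException {
        // Fetch one JD search results page
        Document doc = Jsoup.connect("https://search.jd.com/Search?keyword=Java&page=1&s=1&click=0")
                .timeout(60 * 1000)
                .get();
        // Each matched element is one book card on the page
        Elements items = doc.select("ul.gl-warp.clearfix > li > div.gl-i-wrap");
        for (Element item : items) {
            System.out.println(item.select("div.p-name > a > em").text());
        }
    }
}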
Next comes the actual data collection. The following service crawls book data for a given search keyword and maximum number of pages:
@Service
public class BookParseService {
/**
* @param keyword the book search keyword
* @param maxPage the maximum number of pages to crawl
*/
public void parse(String keyword, int maxPage) {
// Request headers
Map<String, String> headers = new HashMap<>();
headers.put("cookie", "__jdu=1019675532; shshshfpa=f869d8e2-69b9-358e-5dcb-477e8ef9482d-1592724368; shshshfpb=itCeNyfLZd4q0XEuY4ktKAQ%3D%3D; qrsc=3; rkv=1.0; ipLoc-djd=27-2376-2381-0; areaId=27; user-key=ba2c2f27-874b-440c-adef-2fd1d3fa2c7b; TrackID=1ISOZJemvKrBd9TLBsXzdWqMm46MVUhtg4v_nvQ_QQNDaxCFHL_4NX-dEWoV_xuoQfkW0cs-MCjoCyHmNPGXv_JdGjcbdWVEm2rvt5NQBqjTZLek4cCVCgxLEl1sULgkO; pinId=qYIsSFyBW4wlXdeRkXF5A2MUPWT6-mAV; pin=%E4%BE%9D%E7%84%B6%E8%8C%83%E7%89%B9%E8%A5%BFSH; unick=SheHuannn; ceshi3.com=201; _tp=mVV%2BvIxF36NRh0bwXdTccfTGGubTI%2FqluhhWPdnWLrJhG%2FXHI3O%2BIY080h22%2Btjo; _pst=%E4%BE%9D%E7%84%B6%E8%8C%83%E7%89%B9%E8%A5%BFSH; unpl=V2_ZzNtbUFQR0cnChFdfkpYDWIEFVkRUBAScA5FXHMZXQI3BUIOclRCFnQURlVnG1wUZAMZXUNcQRNFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHsYXA1gBRZZQFRzJXI4dmRzH1wDZAIiXHJWc1chVE9UfBheSGcCElVFUUcRdwt2VUsa; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_265cc3f84b594665b6b647299106a7ab|1604149836557; cn=6; shshshfp=2939b23fe88aed5b21d8a1079fee095a; __jda=122270672.1019675532.1592724364.1604149771.1604841540.7; __jdc=122270672; 3AB9D23F7A4B3C9B=TMJSU2H6IHLA72TFXCENIWG2RLB2XS6DOQRSY5ZZ2TPJDB74PA5VRTJ4Y4MCYZKV2R5XTXLR3FJT337JXXTAYZEURE; wlfstk_smdl=u7t6rp7e9lqiwy5muw9o2m04oxuhp7nk; __jdb=122270672.9.1019675532|7.1604841540; shshshsID=a0427fc9f173d34e7f2704055dbc7b8f_7_1604843336659");
headers.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36");
int page = 1;
int total = 0;
HashMap<String, Book> bookMap;
FileWriter fileWriter = null;
try {
File file = new File(System.getProperty("user.dir"), "jd_book.txt");
if (!file.exists()) {
file.createNewFile();
}
fileWriter = new FileWriter(file, true);
// Loop through the pages
while (page <= maxPage) {
// URL of the book search page
String searchUrl = "https://search.jd.com/Search?keyword=" + keyword + "&page=" + page + "&s=1&click=0";
Document document;
try {
// Fetch the search results page with jsoup
document = Jsoup.connect(searchUrl).timeout(60 * 1000).headers(headers).get();
} catch (Exception e) {
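// The page request failed; retry the same page (page is not incremented here)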
continue;
}
Elements list = document.select("ul.gl-warp.clearfix > li > div.gl-i-wrap");
if (list.size() == 0) {
break;
}
bookMap = new HashMap<>();
// Accumulates the skuIds used as the parameter of the comment-count request
StringBuilder referenceIds = new StringBuilder();
for (Element item : list) {
// Book title
String name = item.select("div.p-name > a > em").get(0).text();
// Price
Float price = Float.valueOf(item.select("div.p-price i").text());
// Shop
String shop = item.select("div.p-shopnum > a").text();
if (StringUtils.isEmpty(shop)) {
shop = item.select("div.p-shop a").text();
}
// Image
String img = item.select("div.p-img > a > img").attr("data-lazy-img");
String skuId = item.select("div.p-commit a").attr("id").substring(10);
// Author
String author = "";
// Publisher
String publisher = "";
Elements bookDetails = item.select("div.p-bookdetails");
if (!bookDetails.isEmpty()) {
author = bookDetails.select("span.p-bi-name > a").text();
publisher = bookDetails.select("span.p-bi-store > a").text();
}
// Build the Book object
Book book = new Book();
book.setName(name);
book.setPrice(price);
book.setShop(shop);
book.setImg(img);
book.setSkuId(skuId);
book.setAuthor(author);
book.setPublisher(publisher);
// Keep it in the map so the comment count can be filled in later
bookMap.put(skuId, book);
// Collect skuIds so comment counts for multiple books can be requested in one batch
referenceIds.append(skuId).append(",");
}
// URL for requesting comment counts
String commentUrl = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=" + referenceIds;
String body = "";
try {
// Fetch the comment-count data with jsoup
body = Jsoup.connect(commentUrl).timeout(60 * 1000).headers(headers).get().body().text();
} catch (Exception e) {
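// The comment-count request failed; retry this page from the top of the loop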
continue;
}
JSONObject jsonObject = JSONObject.parseObject(body);
JSONArray jsonArray = jsonObject.getJSONArray("CommentsCount");
for (int i = 0; i < jsonArray.size(); i++) {
String skuId = jsonArray.getJSONObject(i).getString("SkuId");
Integer commentCount = jsonArray.getJSONObject(i).getInteger("CommentCount");
// Set the comment count
bookMap.get(skuId).setCommentCount(commentCount);
// Save the record to the file
fileWriter.write(JSONObject.toJSONString(bookMap.get(skuId)));
fileWriter.write("\n");
System.out.println(JSONObject.toJSONString(bookMap.get(skuId)) + "\n");
}
total += list.size();
System.out.println("采集完成的页数:" + page);
System.out.println("已采集数据条数:" + total);
++page;
// Sleep for 2 seconds before turning the page
Thread.sleep(2 * 1000);
}
} catch (Exception e) {
e.printStackTrace();
} finally {
......
}
}
}
Note that JD may change its page markup at any time, which can invalidate some of the HTML selectors and stop data from being collected; if that happens you will need to adjust the selectors yourself.
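One simple way to adjust them when that happens (just a debugging sketch, not part of the project) is to save a search results page to a local HTML file and test candidate selectors offline with Jsoup.parse; jd_search.html here is a hypothetical local copy:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.File;
import java.io.IOException;

public class SelectorCheck {
    public static void main(String[] args) throws IOException {
        // Parse a locally saved copy of a JD search results page
        Document doc = Jsoup.parse(new File("jd_search.html"), "UTF-8");
        // How many book cards does the current selector still match?
        System.out.println(doc.select("ul.gl-warp.clearfix > li > div.gl-i-wrap").size());
        // Compare against a looser candidate selector
        System.out.println(doc.select("div.gl-i-wrap").size());
    }
}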
Run the method above from a unit test to collect some data:
@RunWith(SpringRunner.class)
@SpringBootTest
class LearnElasticsearchApplicationTests {
@Autowired
BookParseService bookParseService;
@Test
void testES() throws IOException {
bookParseService.parse("小说", 400);
bookParseService.parse("Java开发", 30);
bookParseService.parse("Android开发", 20);
bookParseService.parse("iOS开发", 20);
bookParseService.parse("Python开发", 20);
bookParseService.parse("前端开发", 20);
bookParseService.parse("诗词", 30);
bookParseService.parse("法律", 30);
bookParseService.parse("军事", 30);
bookParseService.parse("经济", 30);
bookParseService.parse("历史", 30);
}
}
Once it finishes, the raw data is stored in jd_book.txt. It will contain some duplicates, which we can remove by deduplicating on skuId:
@Service
public class BookFileService {
/**
* Deduplicate the collected raw data by skuId
*/
public void removeSameBookData() {
// Path of the raw data file
String filePath = System.getProperty("user.dir") + File.separator + "jd_book.txt";
FileReader fileReader = null;
BufferedReader bufferedReader = null;
PrintStream printStream = null;
try {
Set<String> skuIdSet = new HashSet<>();
// Create the file for the deduplicated data
File file = new File(System.getProperty("user.dir"), "jd_book2.txt");
if (file.exists()) {
file.delete();
}
file.createNewFile();
printStream = new PrintStream(new FileOutputStream(file));
fileReader = new FileReader(filePath);
bufferedReader = new BufferedReader(fileReader);
String line;
while ((line = bufferedReader.readLine()) != null) {
Book book = JSON.parseObject(line, Book.class);
if (!skuIdSet.contains(book.getSkuId())) {
skuIdSet.add(book.getSkuId());
// Write non-duplicate records to the new file
printStream.println(line);
}
}
} catch (Exception e) {
e.printStackTrace();
} finally {
......
}
}
}
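The cleanup in the finally block is elided above; as a design note, the same method can be written with try-with-resources so the streams are closed automatically. A sketch of that variant (not the original code; it uses the same fastjson call as above):
import com.alibaba.fastjson.JSON;

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.HashSet;
import java.util.Set;

public class BookDedupSketch {
    public static void removeSameBookData(String inPath, String outPath) throws IOException {
        Set<String> skuIdSet = new HashSet<>();
        // try-with-resources closes the reader and the output stream automatically
        try (BufferedReader reader = new BufferedReader(new FileReader(inPath));
             PrintStream out = new PrintStream(new FileOutputStream(outPath))) {
            String line;
            while ((line = reader.readLine()) != null) {
                Book book = JSON.parseObject(line, Book.class);
                // Set.add returns false for a skuId we have already seen
                if (skuIdSet.add(book.getSkuId())) {
                    out.println(line);
                }
            }
        }
    }
}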
Run the deduplication method from a unit test:
@RunWith(SpringRunner.class)
@SpringBootTest
class LearnElasticsearchApplicationTests {
@Autowired
BookFileService bookFileService;
@Test
void testES() throws IOException {
bookFileService.removeSameBookData();
}
}
jd_book2.txt now holds the deduplicated data, 11,922 records in total, one JSON string per line, for example:
{"author":"(日)东野圭吾","commentCount":1214616,"img":"//img13.360buyimg.com/n1/s200x200_jfs/t1/102681/37/15355/229270/5e708291E88e39fdd/c00b26e445e830dc.jpg","name":"东野圭吾·沉默的巡游(2020全新力作 中文简体版初次上市)","price":53.1,"publisher":"南海出版公司","shop":"新经典文化京东自营店","skuId":"12817948"}
{"author":"莫言","commentCount":99165,"img":"//img12.360buyimg.com/n1/s200x200_jfs/t1/120921/10/8171/274287/5f214206E2f4cc0d9/dc161934a4559c17.jpg","name":"晚熟的人 莫言新书","price":57.2,"publisher":"人民文学出版社","shop":"人民文学出版社","skuId":"12699287"}
Crawling is not the focus of this series, so it's fine if you're not familiar with it; the collected data has already been saved to a file so you can use it directly. Next we can read the file and load the data into ES.
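As a rough preview of that step (only a sketch; the index name book is an assumption here, and the actual indexing code is covered in the next part), reading the file and bulk-indexing the JSON lines with the high level client could look something like this:
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.xcontent.XContentType;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

public class BookImportSketch {
    // Reads jd_book2.txt line by line and indexes each JSON document into an assumed "book" index
    public static void importBooks(RestHighLevelClient client, String filePath) throws IOException {
        BulkRequest bulkRequest = new BulkRequest();
        try (BufferedReader reader = new BufferedReader(new FileReader(filePath))) {
            String line;
            while ((line = reader.readLine()) != null) {
                bulkRequest.add(new IndexRequest("book").source(line, XContentType.JSON));
            }
        }
        // For ~12k records a single bulk request is workable, but flushing in batches is safer
        client.bulk(bulkRequest, RequestOptions.DEFAULT);
    }
}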
The complete code for this article is available at: https://github.com/SheHuan/LearnElasticsearch