Implementing a Java Crawler: Integrating HttpClient with JSoup
Since the Spring Festival, the novel coronavirus has hung over everyone like a cloud of poisonous smog, with no sign of clearing any time soon, which is genuinely worrying. Every day I refresh the latest epidemic figures in WeChat and Alipay, hoping the confirmed, suspected, and death counts will be a little lower than the day before, and I noticed in passing that these apps pull their numbers from the local health commissions. So how are these epidemic statistics collected? As a programmer, I couldn't resist giving it a try.
First, add the HttpClient and JSoup dependencies to the pom file:
<!-- data-crawling dependencies: start -->
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<!-- data-crawling dependencies: end -->
Second, wrap HttpClient's GET method in a utility class:
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Utility class for fetching web pages
* @author 程就人生
* @Date
*/
public class GetResult {
private static Logger log = LoggerFactory.getLogger(GetResult.class);
/**
* Fetches the given URL; pages that do not declare UTF-8 (or gb2312) are decoded as UTF-8
* @param url
* @return
*/
public static String getResult(String url){
//try-with-resources is used here: resources declared inside the try() parentheses are released automatically once the try block finishes
try (
CloseableHttpClient httpClient = HttpClientBuilder.create().build();
CloseableHttpResponse response = httpClient.execute(new HttpGetConfig(url))
){
HttpEntity httpEntity = response.getEntity();
String result = null;
log.info(httpEntity.getContentType().toString());
//when the content type declares neither UTF-8 nor gb2312, decode explicitly as UTF-8
if(!httpEntity.getContentType().getValue().contains("utf-8")
&&!httpEntity.getContentType().getValue().contains("UTF-8") && !httpEntity.getContentType().getValue().contains("gb2312")){
result = EntityUtils.toString(httpEntity, "utf-8");
}else{
result = EntityUtils.toString(httpEntity);
}
return result;
} catch (Exception e) {
log.info("获取失败");
return "";
}
//so there is no need to release the resources in a finally block.
}
/**
* Fetches the given URL and decodes it with a fixed encoding chosen according to the source page
* @param url
* @param encode
* @return
*/
public static String getResult(String url, String encode){
//try-with-resources is used here: resources declared inside the try() parentheses are released automatically once the try block finishes
try (
CloseableHttpClient httpClient = HttpClientBuilder.create().build();
//.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362").build();
CloseableHttpResponse response = httpClient.execute(new HttpGetConfig(url))
){
HttpEntity httpEntity = response.getEntity();
String result = null;
log.info(httpEntity.getContentType().toString());
result = EntityUtils.toString(httpEntity,encode);
return result;
} catch (Exception e) {
log.info("获取失败");
return "";
}
//so there is no need to release the resources in a finally block.
}
}
//Helper class in the same file that extends HttpGet, used to set the request timeout parameters
class HttpGetConfig extends HttpGet {
public HttpGetConfig(String url) {
super(url);
setDefaultConfig();
}
private void setDefaultConfig() {
this.setConfig(RequestConfig.custom()
//timeout for obtaining a connection from the connection manager
.setConnectionRequestTimeout(10000)
//connect timeout
.setConnectTimeout(10000)
//socket (read) timeout
.setSocketTimeout(10000).build());
//request header configuration: a browser-like User-Agent avoids 403 Forbidden responses from some sites
this.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362");
//this.setHeader("Referer","http://,,,");
}
}
JSoup can also be used to fetch web pages, though it is not as specialized for that job as HttpClient; where JSoup shines is in parsing page content. A minimal standalone sketch follows, and after that let's see how JSoup is used to parse the pages in this project.
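As a side note, here is a minimal standalone sketch (not part of the project code) of JSoup fetching and parsing a page on its own; the user-agent string and timeout value are only examples, and whether the ".list a" selector still matches depends on the current page structure.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class JsoupOnlyDemo {
    public static void main(String[] args) throws Exception {
        // JSoup can send the HTTP request itself, though with fewer knobs than HttpClient
        Document doc = Jsoup.connect("http://wjw.ah.gov.cn/news_list_477_1.html")
                .userAgent("Mozilla/5.0")  // a browser-like UA also helps against 403 here
                .timeout(10000)            // milliseconds
                .get();
        // CSS-style selectors make content extraction straightforward; first() may be null if the layout changed
        System.out.println(doc.select(".list a").first().text());
    }
}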
Third, use the utility class to fetch the page content and parse it with JSoup:
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import com.example.demo.util.GetResult;
/**
* Crawls statistics from the Anhui Provincial Health Commission
* @author FengJuan
* @Date
*/
@Service
public class AnhuiCapture {
String virusName = "新型冠状病毒"; //keyword ("novel coronavirus") matched against article titles, kept in Chinese to match the page
private static Logger log = LoggerFactory.getLogger(AnhuiCapture.class);
public void getContentFromUrl() {
try {
String url = "http://wjw.ah.gov.cn/news_list_477_1.html";
//the date (February 9) as it appears in the article title, kept in Chinese to match the page
String date = "2月9日";
String content = GetResult.getResult(url);
//parse the HTML into a Document object
Document doc = Jsoup.parse(content);
//get the parent element that wraps the news list
Element newsList = doc.select(".list").get(0);
//get the first <a>, i.e. the latest article title
String title = newsList.select("a").get(0).text();
if(title.contains(date) && title.contains(virusName)){
String dataUrl = "http://wjw.ah.gov.cn/" + newsList.select("a").get(0).attr("href");
content = GetResult.getResult(dataUrl);
doc = Jsoup.parse(content);
newsList = doc.select("#art_content").get(0);
//loop over the <span> tags; each span holds one category of figures
Elements elements = newsList.select("span");
List<Map<String,Object>> infoList = new ArrayList<>();
Map<String,Object> statisticsInfo = null;
//newly confirmed cases
content = elements.get(0).text();
String[] array = content.substring(content.indexOf(",")+1).split("、");
//the extracted number
String num = null;
String secName = null;
for(String arr : array){
statisticsInfo = new HashMap<String,Object>();
statisticsInfo.put("firstName", "安徽省");
num = arr.replaceAll("[^0-9]","");
//city name
secName = arr.substring(0, arr.indexOf(num));
statisticsInfo.put("secName", secName);
//number of newly confirmed cases
statisticsInfo.put("newConfirm", num);
infoList.add(statisticsInfo);
}
//newly suspected cases
content = elements.get(1).text();
array = content.substring(content.indexOf(",")+1).split("、");
boolean isExist = false;
for(String arr : array){
//assume this city is not in the list yet
isExist = false;
num = arr.replaceAll("[^0-9]","");
//city name
secName = arr.substring(0, arr.indexOf(num));
for(Map<String,Object> s : infoList){
//the city already exists in the list
if(s.get("secName").equals(secName)){
//number of newly suspected cases
s.put("newSuspected", num);
isExist = true;
break;
}
}
//not found, so add a new entry
if(!isExist){
statisticsInfo = new HashMap<String,Object>();
statisticsInfo.put("firstName", "安徽省");
//city name
statisticsInfo.put("secName", secName);
statisticsInfo.put("newSuspected", num);
infoList.add(statisticsInfo);
}
}
//newly cured (discharged) cases
content = elements.get(2).text();
array = content.substring(content.indexOf(",")+1).split("、");
for(String arr : array){
//assume this city is not in the list yet
isExist = false;
num = arr.replaceAll("[^0-9]","");
//city name
secName = arr.substring(0, arr.indexOf(num));
for(Map<String,Object> s : infoList){
//the city already exists in the list
if(s.get("secName").equals(secName)){
//number of newly cured cases
s.put("newCure", num);
isExist = true;
break;
}
}
//not found, so add a new entry
if(!isExist){
statisticsInfo = new HashMap<String,Object>();
statisticsInfo.put("firstName", "安徽省");
//city name
statisticsInfo.put("secName", secName);
statisticsInfo.put("newCure", num);
infoList.add(statisticsInfo);
}
}
//new deaths
content = elements.get(3).text();
array = content.substring(content.indexOf(",")+1).split("、");
for(String arr : array){
//assume this city is not in the list yet
isExist = false;
num = arr.replaceAll("[^0-9]","");
//city name
secName = arr.substring(0, arr.indexOf(num));
for(Map<String,Object> s : infoList){
//the city already exists in the list
if(s.get("secName").equals(secName)){
//number of new deaths
s.put("newDeath", num);
isExist = true;
break;
}
}
//not found, so add a new entry
if(!isExist){
statisticsInfo = new HashMap<String,Object>();
statisticsInfo.put("firstName", "安徽省");
//city name
statisticsInfo.put("secName", secName);
statisticsInfo.put("newDeath", num);
infoList.add(statisticsInfo);
}
}
for(Map<String,Object> s : infoList){
log.info(s.get("firstName").toString() + s.get("secName").toString()
+ ",新增确诊:" + (s.containsKey("newConfirm")?s.get("newConfirm").toString():"0")
+ ",新增疑似:" + (s.containsKey("newSuspected")?s.get("newSuspected").toString():"0")
+ ",新增出院:" + (s.containsKey("newCure")?s.get("newCure").toString():"0")
+ ",新增死亡:" + (s.containsKey("newDeath")?s.get("newDeath").toString():"0"));
}
}
}catch(Exception e){
e.printStackTrace();
log.error("安徽省统计数据获取异常!");
}
}
}
Finally, start the project and check the test output; one minimal way to trigger the crawler is sketched below.
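The article does not show how the crawler is triggered. Below is one hypothetical, minimal way to run it, assuming a Spring Boot application (AnhuiCapture is already annotated with @Service): a CommandLineRunner that fires the Anhui crawler once at startup. In a real project a scheduled task would be the more natural choice.

import org.springframework.boot.CommandLineRunner;
import org.springframework.stereotype.Component;

//Hypothetical entry point: runs the Anhui crawler once when the application starts
@Component
public class CaptureRunner implements CommandLineRunner {

    private final AnhuiCapture anhuiCapture;

    public CaptureRunner(AnhuiCapture anhuiCapture) {
        this.anhuiCapture = anhuiCapture;
    }

    @Override
    public void run(String... args) {
        anhuiCapture.getContentFromUrl();
    }
}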
Problems you may run into:
1. The page itself declares UTF-8, but the HTTP response carries no charset, so the fetched content comes back garbled; forcing UTF-8 in the GetResult utility class fixes this.
2. The page is encoded as gb2312, and decoding it as UTF-8 still yields garbled text; passing "gb2312" to the GetResult utility class instead solves it (see the sketch after this list).
3. Some pages respond with 403 Forbidden; adding a browser-like User-Agent header in the utility class gets around this.
4. Problems can also appear while parsing the page, for example tags that are written by JavaScript; these have to be dealt with using the JSoup classes.
5. There are more than 30 provinces and municipalities, and every health commission lays out its pages differently. Crawling just this one province already took roughly two hundred lines of code; if the data of every province were crawled, any change in page layout or tag structure would ripple through the parsers, and maintaining all of this would become a real chore. Is there a better way?
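As a quick illustration of points 1 and 2, the overloaded getResult(url, encode) from the utility class above can be called with an explicit charset; the URLs here are placeholders, not real pages, and the import assumes the package used earlier in this article.

import com.example.demo.util.GetResult;

public class EncodingDemo {
    public static void main(String[] args) {
        //Point 1: the page declares UTF-8 but the response carries no charset -> force UTF-8
        String utf8Html = GetResult.getResult("http://example.com/utf8_page.html", "utf-8");
        //Point 2: the page is served as gb2312 -> decode it as gb2312 instead of UTF-8
        String gbHtml = GetResult.getResult("http://example.com/gb2312_page.html", "gb2312");
        System.out.println(utf8Html.length() + " / " + gbHtml.length());
    }
}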