使用jsoup简单抓取应用市场数据
2018-05-15 本文已影响73人
浪漫晨风
统计本应用在各个应用市场的下载量是一件繁琐又低效的工作,不符合广大程序员的使用习惯。我是做Android开发的,深知Android应用市场数量众多、逐个统计十分繁琐,于是在网上搜索了一下,找到了jsoup这个库来解析网页,方便我们统计各应用市场的下载量。
1.为了方便封装,便于扩展,封装了一个规则类,以及规则异常
/**
 * Describes one scraping request: the page to fetch, the request parameters
 * to send with it, and how to pick the element of interest out of the
 * returned HTML.
 */
public class Rule {

    /** Request method: HTTP GET. */
    public static final int GET = 0;

    /** Request method: HTTP POST. */
    public static final int POST = 1;

    /** Lookup type: {@code resultTagName} names a class. */
    public static final int CLASS = 0;

    /** Lookup type: {@code resultTagName} names an id. */
    public static final int ID = 1;

    /** Target link. */
    private String url;

    /** Names of the request parameters. */
    private String[] params;

    /** Values of the request parameters, matched to {@link #params} by index. */
    private String[] values;

    /** Tag used for the first filtering pass over the returned HTML; set {@link #type} first. */
    private String resultTagName;

    /** How {@link #resultTagName} is interpreted ({@link #CLASS} or {@link #ID}); defaults to {@link #ID}. */
    private int type = ID;

    /** Request method ({@link #GET} or {@link #POST}); defaults to {@link #GET}. */
    private int requestMoethod = GET;

    /** Creates a rule with every field left at its default. */
    public Rule() {
    }

    /**
     * Creates a fully populated rule.
     *
     * @param url            target link
     * @param params         request parameter names
     * @param values         request parameter values
     * @param resultTagName  tag used to filter the returned HTML
     * @param type           {@link #CLASS} or {@link #ID}
     * @param requestMoethod {@link #GET} or {@link #POST}
     */
    public Rule(String url, String[] params, String[] values,
            String resultTagName, int type, int requestMoethod) {
        this.url = url;
        this.params = params;
        this.values = values;
        this.resultTagName = resultTagName;
        this.type = type;
        this.requestMoethod = requestMoethod;
    }

    // ---- url ----

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    // ---- request parameters ----

    public String[] getParams() {
        return this.params;
    }

    public void setParams(String[] params) {
        this.params = params;
    }

    public String[] getValues() {
        return this.values;
    }

    public void setValues(String[] values) {
        this.values = values;
    }

    // ---- result selection ----

    public String getResultTagName() {
        return this.resultTagName;
    }

    public void setResultTagName(String resultTagName) {
        this.resultTagName = resultTagName;
    }

    public int getType() {
        return this.type;
    }

    public void setType(int type) {
        this.type = type;
    }

    // ---- request method (accessor names keep the original spelling) ----

    public int getRequestMoethod() {
        return this.requestMoethod;
    }

    public void setRequestMoethod(int requestMoethod) {
        this.requestMoethod = requestMoethod;
    }
}
规则异常类
/**
 * Unchecked exception for problems with a scraping rule.
 * Each constructor simply forwards to the matching {@link RuntimeException} constructor.
 */
public class RuleException extends RuntimeException {

    /** Creates an exception without a detail message or cause. */
    public RuleException() {
    }

    /**
     * @param message description of the problem
     */
    public RuleException(String message) {
        super(message);
    }

    /**
     * @param cause underlying failure
     */
    public RuleException(Throwable cause) {
        super(cause);
    }

    /**
     * @param message description of the problem
     * @param cause   underlying failure
     */
    public RuleException(String message, Throwable cause) {
        super(message, cause);
    }
}
2.抓取主要类:
/**
 * Fetches the page described by a {@link Rule} with jsoup and picks a single
 * element out of the returned document.
 */
public class ExtractService
{
    /** Timeout handed to jsoup's {@code Connection.timeout}, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 100000;

    /**
     * Validates the rule, performs the request and selects the result element.
     *
     * @param rule request description; must not be null and must pass validation
     * @return the selected element, or {@code null} when the request fails with
     *         an {@link IOException}, when nothing in the document matches, or
     *         when the rule's type is neither {@link Rule#CLASS} nor {@link Rule#ID}
     * @throws RuleException if the rule is null, its url is empty or not
     *         http/https, its parameter names and values do not pair up, or
     *         its request method is neither GET nor POST
     */
    public static Element extract(Rule rule)
    {
        validateRule(rule);
        Element result = null;
        try
        {
            String resultTagName = rule.getResultTagName();
            String[] params = rule.getParams();
            String[] values = rule.getValues();

            Connection conn = Jsoup.connect(rule.getUrl());
            // Attach the request parameters; validateRule guarantees that
            // values has one entry per parameter name.
            if (params != null)
            {
                for (int i = 0; i < params.length; i++)
                {
                    conn.data(params[i], values[i]);
                }
            }
            conn.timeout(TIMEOUT_MILLIS);

            // validateRule only lets GET and POST through.
            Document doc = (rule.getRequestMoethod() == Rule.POST)
                    ? conn.post()
                    : conn.get();

            // Pick the result element out of the returned document.
            switch (rule.getType())
            {
            case Rule.CLASS:
                // first() is null when no element carries the class.
                result = doc.getElementsByClass(resultTagName).first();
                break;
            case Rule.ID:
            {
                Element byId = doc.getElementById(resultTagName);
                // NOTE(review): this returns the first sibling of the matched
                // element, not the matched element itself; kept as in the
                // original, confirm that this is intended.
                result = (byId == null) ? null : byId.firstElementSibling();
                break;
            }
            default:
                // Unknown lookup type: nothing is selected.
                break;
            }
        } catch (IOException e)
        {
            // Best-effort contract kept from the original: report and return null.
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Performs the necessary checks on the rule before any request is made.
     *
     * @param rule rule to check
     * @throws RuleException if any check fails
     */
    private static void validateRule(Rule rule)
    {
        if (rule == null)
        {
            throw new RuleException("rule不能为空!");
        }
        String url = rule.getUrl();
        if (TextUtil.isEmpty(url))
        {
            throw new RuleException("url不能为空!");
        }
        if (!url.startsWith("http://") && !url.startsWith("https://"))
        {
            throw new RuleException("url的格式不正确!");
        }
        String[] params = rule.getParams();
        if (params != null)
        {
            // A missing values array counts as zero values, so parameter names
            // without values are rejected here instead of failing later.
            String[] values = rule.getValues();
            int valueCount = (values == null) ? 0 : values.length;
            if (params.length != valueCount)
            {
                throw new RuleException("参数的键值对个数不匹配!");
            }
        }
        int requestType = rule.getRequestMoethod();
        if (requestType != Rule.GET && requestType != Rule.POST)
        {
            throw new RuleException("不支持的请求类型: " + requestType);
        }
    }
}
3.运行主类:先用网页分析工具查看各应用市场页面的源码,找到目标数值所在的元素,再通过规则抓取我们想要的数值
/**
 * Entry point: queries each app market in turn (360, Baidu, Anzhi) and lets
 * each helper print the value it scraped.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    Market360();
    MarketBaidu();
    MarketAnzhi();
}
/**
 * Searches the 360 market for the app by keyword and prints the text of the
 * first element with class {@code downNum}, or {@code N/A} when nothing
 * could be extracted.
 */
public static void Market360() {
    Rule rule = new Rule("http://zhushou.360.cn/search/index/",
            new String[] { "kw" }, new String[] { "名医网·健康E家" }, "downNum",
            Rule.CLASS, Rule.GET);
    // extract() returns null when the request fails or no element matches,
    // so the result must not be dereferenced blindly.
    String downloads = java.util.Optional.ofNullable(ExtractService.extract(rule))
            .map(element -> element.text())
            .orElse("N/A");
    System.out.println("360Market:" + downloads);
}
/**
 * Searches the Baidu market for the app and prints the text of the first
 * element with class {@code download-num}, or {@code N/A} when nothing could
 * be extracted.
 */
public static void MarketBaidu() {
    // The "f" value is passed decoded ('@' instead of "%40"): jsoup
    // URL-encodes data values itself, so a pre-encoded value would be
    // sent double-encoded as "%2540".
    Rule rule = new Rule("http://shouji.baidu.com/s", new String[] { "wd",
            "data_type", "f" }, new String[] { "名医网·健康E家", "app",
            "header_all@input@btn_search" }, "download-num",
            Rule.CLASS, Rule.GET);
    // extract() returns null when the request fails or no element matches,
    // so the result must not be dereferenced blindly.
    String downloads = java.util.Optional.ofNullable(ExtractService.extract(rule))
            .map(element -> element.text())
            .orElse("N/A");
    System.out.println("baiduMarket:" + downloads);
}
/**
 * Fetches the app's Anzhi detail page and prints the text of the first
 * element with class {@code spaceleft}, or {@code N/A} when nothing could be
 * extracted.
 */
public static void MarketAnzhi() {
    Rule rule = new Rule("http://www.anzhi.com/pkg/f187_com.zzu.ehome.main.ehome.html",
            null, null, "spaceleft",
            Rule.CLASS, Rule.GET);
    // extract() returns null when the request fails or no element matches,
    // so the result must not be dereferenced blindly.
    String downloads = java.util.Optional.ofNullable(ExtractService.extract(rule))
            .map(element -> element.text())
            .orElse("N/A");
    System.out.println("AnzhiMarket:" + downloads);
}
123.png