【Java】用Applet实现百度关键词查询结果数的抓取

2017-01-02  本文已影响0人  而已Eryee

Java 课的小作业:通过 URL 获取百度搜索结果页面的 HTML,并用正则表达式提取其中的结果个数。


package web;

import java.io.*;

import java.net.*;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import java.applet.Applet;

import java.net.*;

import java.awt.*;

import java.awt.event.*;

/**
 * Applet that queries Baidu for a keyword and displays the "about N results"
 * summary extracted from the returned HTML.
 *
 * <p>The UI consists of a keyword field, a read-only style output field, a
 * search-engine drop-down (Baidu only) and a button that starts the search.
 *
 * <p>NOTE(review): the HTTP request is performed on the thread that delivers
 * the button's {@link ActionEvent}, so the UI is unresponsive while the page
 * is being downloaded.
 */
public class GetUrl extends Applet implements ActionListener {

    /**
     * Matches the literal phrase "相关结果约", followed by the shortest
     * non-empty run of characters, followed by the first "个".
     *
     * <p>The phrase is written as a plain character sequence rather than a
     * character class, so only the exact wording matches, and the quantifier
     * is reluctant so the match ends at the first counter character instead
     * of the last one on the line.
     */
    private static final Pattern RESULT_COUNT_PATTERN =
            Pattern.compile("\u76f8\u5173\u7ed3\u679c\u7ea6(.+?)\u4e2a");

    /** Input field for the keyword to search for. */
    TextField keyword = new TextField(20);

    /** Output field that receives the extracted result-count text. */
    TextField show = new TextField(20);

    /** Drop-down listing the available search engines; populated in {@link #init()}. */
    Choice EngineName;

    /** Button that triggers the search. */
    Button go = new Button("开始搜索");

    /**
     * Builds the user interface and registers this applet as the button's
     * action listener.
     */
    @Override
    public void init() {
        // White background so the applet blends in with the hosting page.
        setBackground(Color.white);

        EngineName = new Choice();
        EngineName.addItem("百度搜索");
        // Show the first (and only) engine by default.
        EngineName.select(0);

        add(keyword);
        add(show);
        add(EngineName);
        add(go);

        go.addActionListener(this);
    }

    /**
     * Runs a search when the search button is pressed and reports any failure
     * in the applet status line.
     *
     * @param e the event delivered by AWT
     */
    @Override
    public void actionPerformed(ActionEvent e) {
        if (e.getSource() != go) {
            return;
        }
        try {
            goSearch();
        } catch (Exception e1) {
            // Top-level UI boundary: surface every failure to the user.
            showStatus("搜索时发生异常:" + e1.toString());
        }
    }

    /**
     * Downloads the Baidu result page for the current keyword and shows the
     * extracted result-count text in the output field.
     *
     * <p>If the keyword is empty or consists only of whitespace, a hint is
     * shown in the status line and no request is made.
     *
     * @throws Exception if the URL cannot be built or the page cannot be read
     */
    public void goSearch() throws Exception {
        String str = keyword.getText();
        if (str == null || str.trim().length() == 0) {
            showStatus("请填写搜索的关键字!");
            return;
        }

        // Percent-encode the keyword so it is safe inside the query string.
        String url = "http://www.baidu.com/s?wd=" + URLEncoder.encode(str, "UTF-8");
        URL u = new URL(url);
        showStatus("正在连接搜索引擎" + url);

        String resultText = extractResultCount(GetData(u));
        if (resultText != null) {
            showStatus("获取完毕");
            show.setText(" " + resultText);
        } else {
            showStatus("没有结果");
        }
    }

    /**
     * Extracts the result-count phrase from a Baidu result page.
     *
     * @param html the page content; may be {@code null}
     * @return the first substring matching the result-count pattern (the
     *         phrase, the count and the trailing counter character), or
     *         {@code null} if {@code html} is {@code null} or contains no match
     */
    public static String extractResultCount(String html) {
        if (html == null) {
            return null;
        }
        Matcher matcher = RESULT_COUNT_PATTERN.matcher(html);
        return matcher.find() ? matcher.group() : null;
    }

    /**
     * Downloads the content behind {@code url} and decodes it as UTF-8.
     *
     * @param url the location to read from
     * @return the decoded content
     * @throws Exception if the stream cannot be opened or read
     */
    public static String GetData(URL url) throws Exception {
        // readInputStream closes the stream, on success and on failure.
        byte[] data = readInputStream(url.openStream());
        // Decode with the charset the code expects the page to use.
        return new String(data, "utf-8");
    }

    /**
     * Reads {@code in} until end of stream and closes it.
     *
     * <p>The stream is closed even when reading fails.
     *
     * @param in the stream to drain; closed before this method returns
     * @return all bytes read from the stream
     * @throws Exception if reading or closing the stream fails
     */
    public static byte[] readInputStream(InputStream in) throws Exception {
        try {
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            byte[] buffer = new byte[4096];
            int bytes;
            while ((bytes = in.read(buffer)) != -1) {
                out.write(buffer, 0, bytes);
            }
            return out.toByteArray();
        } finally {
            in.close();
        }
    }

}

以上です

上一篇 下一篇

猜你喜欢

热点阅读