那些年敲过的JAVA代码

JAVA爬取邮编信息,生成XML

2019-02-28  本文已影响0人  我想专心学习

昨天用 Java 爬取了欣欣旅游网的邮编信息,并将结果生成为 XML 文件。其间用到了 JDOM 的相关知识,在此记录一下爬取过程。

生成的 XML 效果如下所示:

<?xml version="1.0" encoding="utf-8"?>
<postcodes name="恩施市邮编信息">
  <no-address>
    <no>445003</no>
    <address>新建街二巷,巴公路一巷,民族西路二巷,新建街一巷</address>
  </no-address>
  <no-address>
    <no>445014</no>
    <address>龙凤镇大市场,龙凤镇龙凤村,龙凤镇双堰村,龙凤镇向家村</address>
  </no-address>
  <no-address>
    <no>445016</no>
    <address>白杨坪乡鲁竹坝村,白杨乡白杨街,白杨乡朝阳坡村,白杨乡董家店村</address>
  </no-address>
  .............................
  

代码如下:

import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class UrlSpider {

    //爬邮编、地址
    static String regexNo = "<a href=\"/youbian/[\\d]+\">([\\d]+)</a>";
    static String regexAddress = "<p>([^>]+)<a href=\"/youbian/[\\d]+\">[^<]+</a></p>";
    static Pattern patternNo = Pattern.compile(regexNo);
    static Pattern patternAddr = Pattern.compile(regexAddress);
    static StringBuffer sb = new StringBuffer("");
    static ArrayList addrList = new ArrayList();
    static ArrayList noList = new ArrayList();
    public static void main(String[] args) throws Exception {
        BufferedReader bs = getMsg();
        String line = null;
        while ((line = bs.readLine()) != null) {
            sb.append(line);
        }
        //创建解析对象
        String filepath = System.getProperty("user.dir")+ File.separator+"src"+File.separator+"post.xml";
        SAXBuilder saxBuilder = new SAXBuilder();
        InputStream inputStream = Thread.currentThread().getContextClassLoader().getResourceAsStream("post.xml");
        //用解析对象解析输入流生成文档对象
        Document document = saxBuilder.build(inputStream);
        Element rootElement = document.getRootElement();
        rootElement.setName("postcodes");
        rootElement.setAttribute("name","恩施市邮编信息");
        Matcher matchernNo = patternNo.matcher(sb);
        Matcher matchernAddr = patternAddr.matcher(sb);

        while (matchernNo.find()||matchernAddr.find()) {
            Element noaddress = new Element("no-address");
            Element no= new Element("no");
            if (matchernNo.find()) {
                String group = matchernNo.group(1);
                if (group != null) {
//                    System.out.println("邮编号码:" + group);
                    noaddress.addContent(no);
                    no.setText(group);
                }
            }
            if (matchernAddr.find()) {
                String group1 = matchernAddr.group(1);
//                 System.out.println("地址:" + group1);
                Element address = new Element("address");
                rootElement.addContent( noaddress);
                noaddress.addContent(address);
                address.setText(group1);
            }
        }

        //将documnet写入到硬盘
        Format format = Format.getPrettyFormat();//定义格式
        format.setEncoding("utf-8");
        XMLOutputter xmlOutputter = new XMLOutputter(format);
        try {
            xmlOutputter.output(document,new FileOutputStream(filepath));
        } catch (IOException e) {
            e.printStackTrace();
        }


    }

    public static BufferedReader getMsg() throws IOException {
        URL url = new URL("https://tool.cncn.com/youbian/enshi-enshi");
        URLConnection connection = url.openConnection();
        //添加User-Agent
        connection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36");
        return new BufferedReader(new InputStreamReader(connection.getInputStream(), "GBK"));
    }

}


上一篇下一篇

猜你喜欢

热点阅读