微信文章爬取

2019-08-05  本文已影响0人  秦汉邮侠
 /**
  * Extracts the publish date, title, body HTML and image URLs from a fetched
  * article page and stores each value that was found on the page as a result
  * field, so that downstream pipelines can consume them.
  *
  * <p>Result fields written (each one only when the value is present):
  * {@code publishDate}, {@code title}, {@code content}, {@code imageList}.
  *
  * @param page the downloaded page to extract from
  */
 public void process(Page page) {
        String rawText = page.getRawText();
        Html html = page.getHtml();

        // The publish date appears in the raw text in the form `,s="yyyy-MM-dd";`.
        String regEx =  "(,s=\")\\d{4}-\\d{2}-\\d{2}(\";)";
        Matcher matcher = Pattern.compile(regEx).matcher(rawText);
        if (matcher.find()) {
            // Group 1 is the `,s="` prefix and group 2 the `";` suffix; the date is
            // exactly the text between them, so no string splitting is required.
            String publishDate = rawText.substring(matcher.end(1), matcher.start(2));
            page.putField("publishDate", publishDate);
        }

        // NOTE(review): toString() on an XPath selection presumably yields null when
        // nothing matches - confirm; the null checks below guard against that case.
        String title = html.xpath("//h2[@class='rich_media_title']/text()").toString();
        if (title != null) {
            page.putField("title", title);
        }

        // The trailing space inside the class value is intentional: it is part of the
        // attribute value this XPath compares against.
        String content = html.xpath("//div[@class='rich_media_content ']").toString();
        if (content != null) {
            page.putField("content", content);

            // Re-parse only the article body so that images outside of it are ignored.
            // The image URLs are read from `data-src` rather than `src`.
            Html contentHtml = new Html(content);
            List<String> imageList = contentHtml.xpath("//img/@data-src").all();
            page.putField("imageList", imageList);
        }

        // Progress markers kept from the original implementation.
        System.out.println("ok");
        System.out.println("hello");
    }
上一篇 下一篇

猜你喜欢

热点阅读