2019-07-17

2019-07-17  本文已影响0人  程序学习er

import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import com.juxinli.jobscrawler.service.CleanWebService;
import lombok.extern.slf4j.Slf4j;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.springframework.stereotype.Service;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * {@link CleanWebService} implementation that cleans an HTML page with HtmlCleaner,
 * converts it to a W3C DOM and extracts values from it with XPath expressions.
 *
 * <p>Wherever a matched node has no usable value, the literal string {@code "Null"}
 * is stored in the result in its place.
 */
@Slf4j
@Service
public class CleanWebServiceImpl implements CleanWebService {

    /** Placeholder stored in results when a node has no usable value. */
    private static final String MISSING_VALUE = "Null";

    /** Minimum length of the array returned by {@link #getNodeArray(String, String)}. */
    private static final int MIN_ARRAY_LENGTH = 6;

    /**
     * Cleans the given HTML, builds a DOM from it and evaluates the XPath expression
     * against that DOM, requesting a node-set result.
     *
     * <p>If the DOM cannot be built, the error is logged and the expression is still
     * evaluated, with a {@code null} context item. If the XPath evaluation itself fails,
     * the error is logged and {@code null} is returned.
     *
     * @param pageString the raw HTML of the page
     * @param xpath      the XPath expression to evaluate
     * @return the evaluation result (requested as {@code XPathConstants.NODESET}),
     *         or {@code null} if the evaluation threw an {@link XPathExpressionException}
     */
    @Override
    public Object fetchNode(String pageString, String xpath) {
        HtmlCleaner hc = new HtmlCleaner();
        TagNode tn = hc.clean(pageString);
        Document dom = null;
        try {
            dom = new DomSerializer(new CleanerProperties()).createDOM(tn);
        } catch (ParserConfigurationException e) {
            log.error(e.getLocalizedMessage(), e);
        }
        XPath xPath = XPathFactory.newInstance().newXPath();
        Object rootNode = null;
        try {
            rootNode = xPath.evaluate(xpath, dom, XPathConstants.NODESET);
        } catch (XPathExpressionException e) {
            log.error("xpath提取出错", e);
        }
        return rootNode;
    }

    /**
     * Collects the value of one attribute from every node matched by the XPath expression.
     *
     * <p>Exactly one entry is added per matched node: the attribute's text content, or
     * {@code "Null"} when the node has no attribute map (non-element nodes) or does not
     * carry the requested attribute.
     *
     * @param pageString the raw HTML of the page
     * @param xpath      the XPath expression selecting the nodes
     * @param attr       the name of the attribute to read from each node
     * @return the attribute values in match order; empty if nothing matched or the
     *         evaluation failed
     */
    @Override
    public List<String> getNodeListByAttr(String pageString, String xpath, String attr) {
        Object rootNode = fetchNode(pageString, xpath);
        List<String> attrContentList = new ArrayList<>();
        if (rootNode instanceof NodeList) {
            NodeList nodeList = (NodeList) rootNode;
            for (int i = 0; i < nodeList.getLength(); i++) {
                Node node = nodeList.item(i);
                // getAttributes() is null for anything that is not an element.
                Node attrNode = node.getAttributes() == null
                        ? null
                        : node.getAttributes().getNamedItem(attr);
                if (attrNode == null) {
                    // Missing attribute: add the placeholder and nothing else, so the
                    // result stays aligned one-to-one with the matched nodes.
                    attrContentList.add(MISSING_VALUE);
                } else {
                    attrContentList.add(attrNode.getTextContent());
                }
            }
        }
        return attrContentList;
    }

    /**
     * Collects the text content of every node matched by the XPath expression.
     *
     * <p>Nodes whose text content is {@code null} or empty contribute {@code "Null"}.
     *
     * @param pageString the raw HTML of the page
     * @param xpath      the XPath expression selecting the nodes
     * @return the text contents in match order; empty if nothing matched or the
     *         evaluation failed
     */
    @Override
    public List<String> getNodeList(String pageString, String xpath) {
        Object rootNode = fetchNode(pageString, xpath);
        List<String> contentList = new ArrayList<>();
        if (rootNode instanceof NodeList) {
            NodeList nodeList = (NodeList) rootNode;
            for (int i = 0; i < nodeList.getLength(); i++) {
                String text = nodeList.item(i).getTextContent();
                // Compare by value: "!=" on strings only compares object identity.
                contentList.add(text == null || text.isEmpty() ? MISSING_VALUE : text);
            }
        }
        return contentList;
    }

    /**
     * Collects the text content of the matched nodes into an array. Per the original
     * author's note, this is only used to fetch the page count.
     *
     * <p>The returned array has at least six slots; slots without a corresponding node
     * stay {@code null}. If more than six nodes match, the array is sized to hold all
     * of them instead of overflowing.
     *
     * @param pageString the raw HTML of the page
     * @param xpath      the XPath expression selecting the nodes
     * @return an array of length {@code max(6, matchCount)} holding the text contents
     *         in match order, with {@code "Null"} for nodes whose text content is null
     */
    @Override
    public String[] getNodeArray(String pageString, String xpath) {
        Object rootNode = fetchNode(pageString, xpath);
        if (!(rootNode instanceof NodeList)) {
            return new String[MIN_ARRAY_LENGTH];
        }
        NodeList nodeList = (NodeList) rootNode;
        int matchCount = nodeList.getLength();
        // Never smaller than the historical fixed size, but large enough for every match.
        String[] contentArray = new String[Math.max(MIN_ARRAY_LENGTH, matchCount)];
        for (int i = 0; i < matchCount; i++) {
            Node node = nodeList.item(i);
            if (node == null) {
                continue;
            }
            String text = node.getTextContent();
            contentArray[i] = text != null ? text : MISSING_VALUE;
        }
        return contentArray;
    }
}
上一篇下一篇

猜你喜欢

热点阅读