大数据学习

Java操作Solr —— SolrJ

2020-07-08  本文已影响0人  xiaogp

摘要:Solr,Java,分页,游标

引入依赖

引入依赖

<dependency>
            <groupId>org.apache.solr</groupId>
            <artifactId>solr-solrj</artifactId>
            <version>4.10.0</version>
        </dependency>
<dependency>
            <groupId>commons-logging</groupId>
            <artifactId>commons-logging</artifactId>
            <version>1.1.1</version>
        </dependency>
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;

import java.io.IOException;

/**
 * Collection of small SolrJ demos (insert, delete, query, sort, paging, filter, count)
 * that all run against the single Solr core addressed by {@link #SOLR_BASE_URL}.
 *
 * <p>Every demo takes the server as a parameter and prints its results to standard output.
 */
public class solrj_test {

    private static final String SOLR_BASE_URL = "http://localhost:8080/solr/collection3";
    static HttpSolrServer solrServer = new HttpSolrServer(SOLR_BASE_URL);

    /**
     * Runs the currently enabled demo and then shuts the shared server client down.
     *
     * @param args unused
     * @throws IOException         if an enabled demo fails while talking to Solr
     * @throws SolrServerException if Solr reports an error for an enabled demo
     */
    public static void main(String[] args) throws IOException, SolrServerException {
        try {
            // Demo menu: uncomment the call you want to try.
            // insertTest(solrServer);
            // deleteTest(solrServer);
            // queryTest(solrServer);
            // sorttest(solrServer);
            // insertManyTest(solrServer);
            // pageTest(solrServer);
            // filterTest(solrServer);
            // countTest(solrServer);
            query2Test(solrServer);
        } finally {
            // Release the HTTP client held by the server object, even when a demo throws.
            solrServer.shutdown();
        }
    }

    /**
     * Inserts a single document with the fields {@code id} and {@code ent}, then commits.
     *
     * @param solrServer server to write to
     * @throws IOException         if the request cannot be sent
     * @throws SolrServerException if Solr rejects the request
     */
    public static void insertTest(HttpSolrServer solrServer) throws IOException, SolrServerException {
        SolrInputDocument document = new SolrInputDocument();
        document.addField("id", "test");
        document.addField("ent", "测试solrj");
        solrServer.add(document);
        solrServer.commit();
    }

    /**
     * Inserts several documents (one add request per document) and commits once at the end.
     *
     * @param solrServer server to write to
     * @throws IOException         if a request cannot be sent
     * @throws SolrServerException if Solr rejects a request
     */
    public static void insertManyTest(HttpSolrServer solrServer) throws IOException, SolrServerException {
        // Each row is {id, ent}.
        String[][] data = {{"test2", "b"}, {"test3", "d"}};
        for (String[] row : data) {
            SolrInputDocument document = new SolrInputDocument();
            document.addField("id", row[0]);
            document.addField("ent", row[1]);
            solrServer.add(document);
        }
        solrServer.commit();
    }

    /**
     * Deletes everything matching the query {@code id:test}, then commits.
     *
     * @param solrServer server to delete from
     * @throws IOException         if the request cannot be sent
     * @throws SolrServerException if Solr rejects the request
     */
    public static void deleteTest(HttpSolrServer solrServer) throws IOException, SolrServerException {
        solrServer.deleteByQuery("id:test");
        solrServer.commit();
    }

    /**
     * Runs a match-all query and prints the number of returned documents followed by
     * each document's {@code id} and {@code ent}.
     *
     * @param solrServer server to query
     * @throws SolrServerException if the query fails
     */
    public static void queryTest(HttpSolrServer solrServer) throws SolrServerException {
        SolrQuery params = new SolrQuery("*:*");
        SolrDocumentList list = solrServer.query(params).getResults();
        System.out.println("一共获取了" + list.size() + "条结果:");
        printDocuments(list, "ent:");
    }

    /**
     * Runs a match-all query sorted by {@code id} descending and prints the results.
     *
     * @param solrServer server to query
     * @throws SolrServerException if the query fails
     */
    public static void sorttest(HttpSolrServer solrServer) throws SolrServerException {
        SolrQuery params = new SolrQuery("*:*");
        params.setSort("id", SolrQuery.ORDER.desc);
        SolrDocumentList list = solrServer.query(params).getResults();
        System.out.println("一共获取了" + list.size() + "条结果:");
        printDocuments(list, "ent:");
    }

    /**
     * Runs a match-all query with {@code start=10} and {@code rows=5} and prints the results.
     *
     * @param solrServer server to query
     * @throws SolrServerException if the query fails
     */
    public static void pageTest(HttpSolrServer solrServer) throws SolrServerException {
        SolrQuery params = new SolrQuery("*:*");
        params.setStart(10);
        params.setRows(5);
        SolrDocumentList list = solrServer.query(params).getResults();
        System.out.println("一共获取了" + list.size() + "条结果:");
        printDocuments(list, "ent: ");
    }

    /**
     * Runs a match-all query restricted by the filter query {@code id:[2 TO *]} and prints
     * the results.
     *
     * @param solrServer server to query
     * @throws SolrServerException if the query fails
     */
    public static void filterTest(HttpSolrServer solrServer) throws SolrServerException {
        SolrQuery params = new SolrQuery("*:*");
        params.setFilterQueries("id:[2 TO *]");
        printDocuments(solrServer.query(params).getResults(), "ent: ");
    }

    /**
     * Prints the total hit count ({@code numFound}) of a match-all query.
     *
     * @param solrServer server to query
     * @throws SolrServerException if the query fails
     */
    public static void countTest(HttpSolrServer solrServer) throws SolrServerException {
        SolrQuery params = new SolrQuery("*:*");
        long foundNum = solrServer.query(params).getResults().getNumFound();
        System.out.println(foundNum);
    }

    /**
     * Queries the {@code ent} field twice with the same text, first unquoted and then
     * wrapped in double quotes (a phrase query), printing the results of each.
     *
     * @param solrServer server to query
     * @throws SolrServerException if a query fails
     */
    public static void query2Test(HttpSolrServer solrServer) throws SolrServerException {
        System.out.println("模糊查询");
        SolrQuery params = new SolrQuery("ent:通辽城投");
        printDocuments(solrServer.query(params).getResults(), "ent: ");

        System.out.println("精确查询");
        SolrQuery params2 = new SolrQuery("ent:\"通辽城投\"");
        printDocuments(solrServer.query(params2).getResults(), "ent: ");
    }

    /**
     * Prints the {@code id} and {@code ent} field of every document in the list.
     *
     * @param docs     documents to print
     * @param entLabel exact text printed in front of the {@code ent} value
     */
    private static void printDocuments(SolrDocumentList docs, String entLabel) {
        for (SolrDocument doc : docs) {
            System.out.println("id: " + doc.getFieldValue("id"));
            System.out.println(entLabel + doc.getFieldValue("ent"));
        }
    }
}

批量查询数据

使用翻页

ublic static SolrDocumentList getBatchData(String dataString) throws SolrServerException {
        SolrDocumentList solrDocuments = new SolrDocumentList();
        long total = 0;

        SolrQuery params = new SolrQuery();
        params.set("q", "*:*");
        params.set("fq", String.format("inputtime: %s*", dataString));
        params.set("fl", "ID,inputtime,nerent,title,pubdate,TEXT");
        QueryResponse res = SolrUtils.getInstance().solrServer.query(params);
        if (!res.getResults().isEmpty()) {
            total = res.getResults().getNumFound();
        }
        if (total == 0) {
            LOGGER.info(String.format("当前日期没有需要处理的增量数据!日期:%s", dataString));
//            System.exit(0);
        } else {
            LOGGER.info(String.format("当前日期数据量:%d", total));
        }
        long batch = total / 1000 + 1;
        // 翻页查询所有数据
        for (int i = 0; i < batch; i++) {
            params.set("start", i * 1000);
            params.set("rows", 1000);
            QueryResponse batchRes = SolrUtils.getInstance().solrServer.query(params);
            if (!batchRes.getResults().isEmpty()) {
                solrDocuments.addAll(batchRes.getResults());
                LOGGER.info(String.format("日期 %s %d / %d", dataString, solrDocuments.size(), total));
            }
        }
        return solrDocuments;
    }

使用游标
分页读取的方式在大数据量的情况下,在solr里面表现并不是特别好,因为它随时可能发生OOM异常。
在solr里面通过rows和start参数可以非常方便地分页读取,但是如果start=1000000、rows=10,那么solr会将前面100万条数据的索引信息读取到内存里面,这样一来,非常耗内存。
游标是无状态的,不会在内存里面维护索引数据,仅仅记录最后一个doc的计算值(类似md5)作为mark;每一次读取都会记录最后一个doc的mark,下一次通过这个mark便能快速地定位到下一页上,如此往复,便能完成整个数据的读取,而且耗费内存非常少。
游标的缺点是一旦向后读取,就不能再返回上一次的位置;另外使用游标时,排序条件必须包含主键(uniqueKey)字段的升序或者降序,并且主键值不能重复。

/**
 * Loads every document whose {@code inputtime} starts with the given prefix, using a
 * cursor ({@code cursorMark}) sorted by {@code ID} ascending, 1000 documents per request.
 *
 * <p>A count-only query ({@code rows=0}) is issued first so the total can be logged.
 * If there are no hits, the method logs that and returns an empty list without
 * opening a cursor.
 *
 * @param dataString prefix matched against the {@code inputtime} field (sent as {@code inputtime: <prefix>*})
 * @return all documents collected through the cursor, possibly empty
 * @throws SolrServerException if any query fails
 */
public static SolrDocumentList getBatchData2(String dataString) throws SolrServerException {
    SolrDocumentList solrDocuments = new SolrDocumentList();

    SolrQuery params = new SolrQuery();
    params.set("q", "*:*");
    params.set("fq", String.format("inputtime: %s*", dataString));
    params.set("fl", "ID,inputtime,nerent,title,pubdate,TEXT");

    // Only numFound is needed here, so ask Solr not to return any documents.
    params.setRows(0);
    long total = SolrUtils.getInstance().solrServer.query(params).getResults().getNumFound();
    if (total == 0) {
        LOGGER.info(String.format("当前日期没有需要处理的增量数据!日期:%s", dataString));
        return solrDocuments;
    }
    LOGGER.info(String.format("当前日期数据量:%d", total));

    params.setRows(1000);
    params.setSort("ID", SolrQuery.ORDER.asc);
    String cursorMark = CursorMarkParams.CURSOR_MARK_START;
    while (true) {
        params.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark);
        QueryResponse batchRes = SolrUtils.getInstance().solrServer.query(params);
        solrDocuments.addAll(batchRes.getResults());

        String nextCursorMark = batchRes.getNextCursorMark();
        // An unchanged mark means the cursor is exhausted. A missing mark (response
        // without nextCursorMark) is treated the same way instead of throwing an NPE.
        if (nextCursorMark == null || nextCursorMark.equals(cursorMark)) {
            break;
        }
        cursorMark = nextCursorMark;
    }
    return solrDocuments;
}
上一篇 下一篇

猜你喜欢

热点阅读