hadoop入门系列--使用hbase过滤器(一篇全掌握)

2020-04-15  本文已影响0人  微生活_小阿楠

传送门
hadoop入门系列--hbase基础知识点
hadoop入门系列--从本地把数据导入Hbase
hadoop入门系列--用java代码实现创建hbase表
hadoop入门系列--使用hbase过滤器(一篇全掌握)
传送门

1)BinaryComparator()与SubstringComparator()的区别

概念简述:

  • BinaryComparator按字典序将目标值与指定的字节数组进行比较,内部采用Bytes.compareTo(byte[], byte[])
  • SubstringComparator判断提供的子串是否出现在value中(比较时不区分大小写)

总结:BinaryComparator一般用于已经确定好,肯定一模一样的(比如列名、列限定符),SubstringComparator一般用于模糊匹配(也可以说是有包含就匹配)(比如值)

注意:这里的代码全部由笔者亲自验证过,题目由易到难。基本上每一道题目都用到了2个及以上的过滤器,属于多条件过滤。

练习: 请按照以下要求完成查询任务:

  • (1)请查询出属于“互联网”产业的公司的职位名称;
  • (2)请查询出学历要求是“硕士”的职位信息;
  • (3)请分页(每页2条)查询出职位为“机器学习”的职位信息(查2页);
  • (4)请查询出“北京”或“上海”薪水在“10k-20k”之间的职位信息;
  • (5)请查询出“北京”的公司规模在“100人”以上的职位信息;

2)话不多说,直接上代码

package hbase_put_scan;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.NamespaceDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.ColumnPrefixFilter;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
import org.apache.hadoop.hbase.filter.FamilyFilter;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.PageFilter;
import org.apache.hadoop.hbase.filter.QualifierFilter;
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter;
import org.apache.hadoop.hbase.filter.SubstringComparator;
import org.apache.hadoop.hbase.filter.ValueFilter;
import org.apache.hadoop.hbase.util.Bytes;
 

public class HbaseFilterTest {
    public static Configuration conf;
    public static Connection conn;
    
    static {
        //1.获取资源
        conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.property", "2181");
        conf.set("hbasae.zookeeper.quorum", "centos");
        conf.set("hbase.master", "centos:60000");
        try {
            //2.创建连接
            conn = ConnectionFactory.createConnection(conf);
        }catch(IOException e) {
            e.printStackTrace();
        }
    }
    //(1)请查询出属于“互联网”产业的公司的职位名称; 
    public static void scanColumnFamily(String Table,String Family,String company_industry,String qualifier,String job_name) throws IOException{
        Admin admin = conn.getAdmin();
        //3.依据指定表名建立table实例
        Table table = conn.getTable(TableName.valueOf(Table));
        //4.建立scan实例
        Scan scan = new Scan();
        //单列值过滤器
        SingleColumnValueFilter singleColumnValueFilter = new SingleColumnValueFilter(
                Family.getBytes(),
                company_industry.getBytes(),
                CompareOp.EQUAL,
                new SubstringComparator(qualifier)
                );  
        //列过滤器
        Filter qualifierFilter = new QualifierFilter(CompareOp.EQUAL,new BinaryComparator(job_name.getBytes()));
        //把上面两个过滤器and
        FilterList filterList = new FilterList();
        filterList.addFilter(singleColumnValueFilter);
        filterList.addFilter(qualifierFilter);
        
        scan.setFilter(filterList);
        ResultScanner resultScanner = table.getScanner(scan);
        //5.遍历读取ResultScanner集中内容
        int a = 0;
        for(Result result : resultScanner) {
            //System.out.println(new String(result.getValue(Family.getBytes(), job_name.getBytes())));
            //遍历读取result集中的内容
            List<Cell> cells = result.listCells();
            for(Cell cell : cells) {a++;
                System.out.print("行健:" + new String(CellUtil.cloneRow(cell) ));
                System.out.print("列族:" + new String(CellUtil.cloneFamily(cell) ));
                System.out.print("列:" + new String(CellUtil.cloneQualifier(cell) ));
                System.out.println("值:" + new String(CellUtil.cloneValue(cell) ));
            }
            System.out.println("============================================="+ a);
        }
        //6.关闭打开的资源
        resultScanner.close();
        table.close();
        conn.close();
    }
    
    //(2)请查询出学历要求是“硕士”的职位信息;
    public static void job_info(String Table,String family,String job_edu_require,String qualifier,String job_info) throws IOException{
        Admin admin = conn.getAdmin();
        Table table = conn.getTable(TableName.valueOf(Table));
        
        Scan scan = new Scan();
        //单列值过滤器
        SingleColumnValueFilter singleColumnValueFilter = new SingleColumnValueFilter(
                family.getBytes(),
                job_edu_require.getBytes(),
                CompareOp.EQUAL,
                new SubstringComparator(qualifier)
                );

        scan.setFilter(singleColumnValueFilter);
        ResultScanner resultScanner = table.getScanner(scan);
        int a = 0;  
        for(Result result : resultScanner) {a++; 
            //方法一
            System.out.println("RowKey:" + new String(result.getRow()) + " qualifier=" + new String(result.getValue(family.getBytes(),job_info.getBytes())));
            System.out.println("========================");
            //方法二   (参考第一道题)        
        }       System.out.println(a);
    }
             
    //(3)请查询出“北京”或“上海”薪水在“10k-20k”之间的职位信息;
    public static void company_job_info() throws IOException{
        Admin admin = conn.getAdmin();
        //3.依据指定表名建立table实例
        Table table = conn.getTable(TableName.valueOf("jobs"));
        //4.建立scan实例
        Scan scan = new Scan();
        //单列值过滤器--薪水在“10k-20k”
        SingleColumnValueFilter singleColumnValueFilter_salary = new SingleColumnValueFilter("info".getBytes(),"job_salary".getBytes(),CompareOp.EQUAL,new SubstringComparator("10k-20k")); 
        //单列值过滤器--北京
        SingleColumnValueFilter singleColumnValueFilter1 = new SingleColumnValueFilter("info".getBytes(),"company_location".getBytes(),CompareOp.EQUAL,new SubstringComparator("北京")); 
        //单列值过滤器--上海
        SingleColumnValueFilter singleColumnValueFilter2 = new SingleColumnValueFilter("info".getBytes(),"company_location".getBytes(),CompareOp.EQUAL,new SubstringComparator("上海")); 
        //列过滤器--只获取所有职位信息
        Filter qualifierFilter = new QualifierFilter(CompareOp.EQUAL,new SubstringComparator("job_"));
        Filter qualifierFilter_companylocation = new QualifierFilter(CompareOp.EQUAL,new BinaryComparator("company_location".getBytes()));
        
        //公司地址的组合--北京or上海
        FilterList filterList_location = new FilterList(FilterList.Operator.MUST_PASS_ONE);
        filterList_location.addFilter(singleColumnValueFilter1);
        filterList_location.addFilter(singleColumnValueFilter2);
 
        //查询列的组合(可以把你想要输出的列写在这里)
        FilterList filterList_column = new FilterList(FilterList.Operator.MUST_PASS_ONE);
        filterList_column.addFilter(qualifierFilter);
        filterList_column.addFilter(qualifierFilter_companylocation);
        
        //把上面所有过滤器and
        FilterList filterList = new FilterList();
        filterList.addFilter(singleColumnValueFilter_salary);
        filterList.addFilter(filterList_location);    
        filterList.addFilter(filterList_column);
        
        scan.setFilter(filterList);
        ResultScanner resultScanner = table.getScanner(scan);
        //5.遍历读取ResultScanner集中内容   
        for(Result result : resultScanner) {
            //System.out.println(new String(result.getValue(Family.getBytes(), job_name.getBytes())));
            //遍历读取result集中的内容
            List<Cell> cells = result.listCells();
            for(Cell cell : cells) {
                System.out.print("行健:" + new String(CellUtil.cloneRow(cell) ));
                System.out.print("列族:" + new String(CellUtil.cloneFamily(cell) ));
                System.out.print("列:" + new String(CellUtil.cloneQualifier(cell) ));
                System.out.println("值:" + new String(CellUtil.cloneValue(cell) ));
            }
            System.out.println("=============================================");
        }
        //6.关闭打开的资源
        resultScanner.close();
        table.close();
        conn.close();
    }
    
    //(4)请查询出“北京”的公司规模在“100人”以上的职位信息;
    public static void company_people_job_info() throws IOException{
        Admin admin = conn.getAdmin();
        //3.依据指定表名建立table实例
        Table table = conn.getTable(TableName.valueOf("jobs"));
        //4.建立scan实例
        Scan scan = new Scan();
        //单列值过滤器--北京
        SingleColumnValueFilter singleColumnValueFilter = new SingleColumnValueFilter(
                "info".getBytes(),
                "company_location".getBytes(),
                CompareOp.EQUAL,
                new SubstringComparator("北京")
                );  
        //单列值过滤器--公司规模在“100人”以上
        SingleColumnValueFilter singleColumnValueFilter_people = new SingleColumnValueFilter("info".getBytes(),"company_people".getBytes(),CompareOp.EQUAL,new SubstringComparator("00")); 

        
        //把上面两个过滤器and
        FilterList filterList = new FilterList();
        filterList.addFilter(singleColumnValueFilter);
        filterList.addFilter(singleColumnValueFilter_people);
        
        scan.setFilter(filterList);
        ResultScanner resultScanner = table.getScanner(scan);
        //5.遍历读取ResultScanner集中内容
        int a = 0;
        for(Result result : resultScanner) {
            //System.out.println(new String(result.getValue(Family.getBytes(), job_name.getBytes())));
            //遍历读取result集中的内容
            List<Cell> cells = result.listCells();
            for(Cell cell : cells) {a++;
                System.out.print("行健:" + new String(CellUtil.cloneRow(cell) ));
                System.out.print("列族:" + new String(CellUtil.cloneFamily(cell) ));
                System.out.print("列:" + new String(CellUtil.cloneQualifier(cell) ));
                System.out.println("值:" + new String(CellUtil.cloneValue(cell) ));
            }
            System.out.println("============================================="+ a);
        }
        //6.关闭打开的资源
        resultScanner.close();
        table.close();
        conn.close();
    }
    
    
    public static void main(String[] args) throws IOException{  
        //(1)请查询出属于“互联网”产业的公司的职位名称
        //scanColumnFamily("jobs","info","company_industry","互联网","job_name");
        
        //(2)请查询出学历要求是“硕士”的职位信息;(下面是做法一、做法二);   
        //job_info("jobs","info","job_edu_require","硕士","job_info");        
        //scanColumnFamily("jobs","info","job_edu_require","硕士","job_info");
        
        //(3)请查询出“北京”或“上海”薪水在“10k-20k”之间的职位信息;
        company_job_info();
        
        //(4)请查询出“北京”的公司规模在“100人”以上的职位信息;
        //company_people_job_info();
    }
    
}

image.png
上一篇下一篇

猜你喜欢

热点阅读