elasticsearch java client 自定义分词器

API 探索(两种配置方式)


Java client


<!-- elasticsearch -->

目前测试下来,Java client 只支持先生成索引、再进行设置,大致的代码如下:

// Caller-side setup: obtain the indices client and collect the arguments handed to configAnalyzer.
// NOTE(review): targetName is not defined in the visible lines — presumably the target index name; confirm.
ElasticsearchIndicesClient indicesClient = elasticsearchClient.indices();
// NOTE(review): the Fund class below looks truncated in this snippet (no closing brace, and the
// constructor used by `new Fund(deleteId)` further down is not shown) — confirm against the full source.
class Fund {
    private String innerCode;
// Id of the placeholder document that configAnalyzer indexes first and deletes at the end.
String deleteId = "pre_add_end_delete";
// Name under which the custom analyzer is registered in the index settings.
String analyzer = "my_ngram_analyzer";
// Tokenizer assigned to the custom analyzer.
String tokenizer = "ngram";
// Names of the fields whose mappings configAnalyzer updates.
List<String> fields = List.of("secuCode", "secuAbbr");
configAnalyzer(indicesClient, targetName, deleteId, new Fund(deleteId), analyzer, tokenizer, fields);

/**
 * Registers a custom analyzer on an index and updates the mappings of the given fields.
 *
 * <p>Visible steps, in order: index a placeholder document, close the index, put the analysis
 * settings, put the field mappings, reopen the index, delete the placeholder document.
 *
 * <p>NOTE(review): this snippet appears truncated — the lambdas passed to close/putSettings/
 * putMapping/open and the method itself have no closing braces in the visible lines, and no
 * {@code index(targetName)} call is visible for close, putMapping or open. Confirm against the
 * full source before reusing.
 *
 * @param indicesClient  client used for the close / putSettings / putMapping / open calls
 * @param targetName     index name used when indexing and deleting the placeholder document and
 *                       when putting the settings
 * @param deleteId       id of the placeholder document
 * @param deleteDocument placeholder document body indexed under {@code deleteId}
 * @param analyzer       name under which the custom analyzer is registered
 * @param tokenizer      tokenizer assigned to the custom analyzer
 * @param fields         names of the fields whose mappings are written
 */
private void configAnalyzer(ElasticsearchIndicesClient indicesClient, String targetName, String deleteId, Object deleteDocument, String analyzer, String tokenizer, List<String> fields) {
    // Index a placeholder document first — presumably so that the index exists before it is
    // closed and configured; confirm.
    elasticsearchClient.index(i -> i.index(targetName).id(deleteId).document(deleteDocument));

    // Close the index first; otherwise changing these settings is not allowed.
    indicesClient.close(x -> {
        elasticSearchService.printDsl(CloseIndexRequest.class, CloseIndexRequest.Builder.class, x);
        return x;

    // Register the custom analyzer in the index settings.
    indicesClient.putSettings(x -> {
        x.index(targetName).settings(y -> y
                .analysis(z -> z.analyzer(analyzer, a -> a.custom(b -> b.tokenizer(tokenizer)))));
        elasticSearchService.printDsl(PutIndicesSettingsRequest.class, PutIndicesSettingsRequest.Builder.class, x);
        return x;

    // Configure the fields that are meant to use the custom analyzer.
    // NOTE(review): the visible lines map each field as text with a keyword sub-field
    // (ignoreAbove 256) but never pass `analyzer` to the text property — a line may have been
    // lost from this snippet; confirm.
    indicesClient.putMapping(x -> {
        fields.forEach(f -> {
            x.properties(f, Property.of(y -> y.text(z -> z
                    .fields(Property.Kind.Keyword.jsonValue(),a -> a.keyword(b -> b.ignoreAbove(256))))));
        elasticSearchService.printDsl(PutMappingRequest.class, PutMappingRequest.Builder.class, x);
        return x;

    // Reopen the index; otherwise its data cannot be modified.
    indicesClient.open(x -> {
        elasticSearchService.printDsl(OpenRequest.class, OpenRequest.Builder.class, x);
        return x;

    // Finally, delete the placeholder document that was indexed at the start.
    elasticsearchClient.delete(i -> i.index(targetName).id(deleteId));


/**
 * Settings for the Elasticsearch client, declared with the configuration prefix
 * {@code elastic-search}.
 *
 * <p>NOTE(review): the class body appears truncated in this snippet — there is no closing brace,
 * and the getters called by buildElasticsearchClient (getHostname, getPort, getLimitSize) are not
 * shown; presumably they are generated or were dropped from the excerpt. Confirm.
 */
@ConfigurationProperties(prefix = "elastic-search")
public class ElasticSearchProperty implements Serializable {
    /** Host name of the Elasticsearch node; read when building the HttpHost. */
    private String hostname;
    /** Port of the Elasticsearch node; read together with {@code hostname}. */
    private int port;

    /** Connect timeout. NOTE(review): not read in the visible code; unit presumably milliseconds — confirm. */
    private int connectTimeout;

    /** Socket timeout. NOTE(review): not read in the visible code; unit presumably milliseconds — confirm. */
    private int socketTimeout;

    /** Defaults to 9999. NOTE(review): presumably a default result size for fund queries — confirm against callers. */
    private int defaultFundSize = 9999;

    /** Defaults to 9999. NOTE(review): presumably a default result size for manager queries — confirm against callers. */
    private int defaultManagerSize = 9999;

    /** Limit passed to HeapBufferedResponseConsumerFactory; defaults to 200 * 1024 * 1024 (presumably bytes — confirm). */
    private int limitSize = 200 * 1024 * 1024;

/**
 * Builds an ElasticsearchClient from the given connection properties.
 *
 * <p>Visible steps: create the HttpHost from hostname and port; define an HTTP client
 * customization (credentials provider, default Content-Type header, and a response interceptor
 * that adds the X-Elastic-Product header); configure the Jackson ObjectMapper of a
 * JacksonJsonpMapper; wrap the REST client and mapper in a RestClientTransport; and attach
 * request options whose response buffer limit comes from {@code property.getLimitSize()}.
 *
 * <p>NOTE(review): this snippet appears truncated — the lambda body lacks its receiver
 * expression, the {@code RestClient.builder(httpHost)} chain is cut off, and the method has no
 * closing brace in the visible lines. Confirm against the full source.
 *
 * @param property connection settings; hostname, port and limitSize are read here
 * @return a client built from the transport and the customized request options
 */
public ElasticsearchClient buildElasticsearchClient(ElasticSearchProperty property) {
    HttpHost httpHost = new HttpHost(property.getHostname(), property.getPort());

    // https://stackoverflow.com/questions/71142680/co-elastic-clients-transport-transportexception-es-search-missing-x-elastic
    // The default headers the RestClientBuilder allows you to specify are the request headers, not the response headers. The error you are getting is because older Elasticsearch [server] versions do not include the X-Elastic-Product=Elasticsearch header in any of the API responses, but the recent distributions do (7.14+?), so the newer versions of elasticsearch-java (i.e. client) expects them.
    // The response interceptor below therefore adds the header on the client side.
    RestClientBuilder.HttpClientConfigCallback httpClientConfigCallback = httpClientBuilder ->
                    // java.io.IOException: Connection reset by peer
                    // https://cloud.tencent.com/developer/article/1943055
                    // NOTE(review): the expression these calls are chained on is missing from the snippet.
                    .setDefaultCredentialsProvider(new BasicCredentialsProvider())
                    .setDefaultHeaders(List.of(new BasicHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString())))
                    .addInterceptorLast((HttpResponseInterceptor) (response, context) -> response.addHeader("X-Elastic-Product", "Elasticsearch"));
    // NOTE(review): the rest of this builder chain (presumably applying httpClientConfigCallback
    // and calling build()) is not visible here.
    var restClient = RestClient.builder(httpHost)
    JacksonJsonpMapper mapper = new JacksonJsonpMapper();
    ObjectMapper objectMapper = mapper.objectMapper();
    // Do not fail on unknown JSON properties, and accept a single value where an array is expected.
    objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
    objectMapper.configure(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY, true);

    // Register the module that handles LocalDate / LocalDateTime values.
    objectMapper.registerModule(new JavaTimeModule());

    // default:/index_name/_search?typed_keys=true => /index_name/_search?typed_keys=false
    // Elasticsearch search requests accept a typed_key parameter that allow returning type information along with the name in aggregation and suggestion results (see the aggregations documentation for additional details).
    // However, in some use cases serializing objects in the typed_keys format may not be desirable, for example when the Java API Client is used in an application that acts as a front-end to other services that expect the default format for aggregations and suggestions.
    // You can disable typed_keys serialization by setting the JsonpMapperFeatures.SERIALIZE_TYPED_KEYS attribute to false on your mapper object:
    // mapper.withAttribute(JsonpMapperFeatures.SERIALIZE_TYPED_KEYS, false);
    // result:"\"aggregations\":{\"avg#price\":{\"value\":3.14}}}") VS "\"aggregations\":{\"price\":{\"value\":3.14}}}"
    // avg returned
    // NOTE(review): in the visible lines the withAttribute call exists only as a comment, so the
    // mapper passed to the transport below is the unmodified one — confirm whether that is intended.
    ElasticsearchTransport transport = new RestClientTransport(restClient, mapper);

    // Response buffer limit: use a heap-buffered response consumer sized by property.getLimitSize().
    RequestOptions.Builder requestOptionsBuilder = RequestOptions.DEFAULT.toBuilder();
    requestOptionsBuilder.setHttpAsyncResponseConsumerFactory(new HttpAsyncResponseConsumerFactory.HeapBufferedResponseConsumerFactory(property.getLimitSize()));
    RestClientOptions myOptions = new RestClientOptions(requestOptionsBuilder.build());

    return new ElasticsearchClient(transport, myOptions);


  1. 查看数据
get /person/_search
  1. 删除索引
delete /person
  1. 删除其中一条数据
delete /person/_doc/pre_add_end_delete_id
  1. 索引生成前对索引进行设置(自定义分词器+对字段进行设置)
PUT /person
  "settings": {
    "index": {
      "max_result_window": 50000
    "analysis": {
      "analyzer": {
          "tokenizer": "my_tokenizer",
          "filter": ["my_stop_token_filter"]
      "char_filter": {
          "mappings":["* => _","= => ~"]
      "tokenizer": {
      "filter": {
          "stopwords": ["is","a","the"]
  "mappings": {
    "properties": {
        "first_name" : {
          "type": "text",
          "analyzer": "my_ngram_analyzer"
        "interests" : {
          "type" : "text",
          "analyzer": "my_ngram_analyzer",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
  1. 先有索引再进行设置(简化版本)
PUT /person/_settings
{
  "analysis": {
    "analyzer": {
      "my_ngram_analyzer": {
        "tokenizer": "ngram",
        "filter": []
      }
    }
  }
}
  1. 先有索引再进行设置(对字段设置)
POST /person/_mapping
{
  "properties": {
      "first_name" : {
        "type": "text",
        "analyzer": "my_ngram_analyzer"
      },
      "interests" : {
        "type" : "text",
        "analyzer": "my_ngram_analyzer",
        "fields" : {
          "keyword" : {
            "type" : "keyword",
            "ignore_above" : 256
          }
        }
      }
  }
}
  1. 插入测试数据
PUT /person/_doc/1
{
    "first_name" : "400001.OF",
    "last_name" :  "Smith",
    "age" :        25,
    "about" :      "I love to go rock bbb",
    "about_true" :      "I love to go rock bbb",
    "interests": [ "400001.OF", "music" ],
    "someday": "1989-02-09"
}

PUT /person/_doc/2
{
    "first_name" : "付水电费水电费",
    "last_name" :  "Smith",
    "age" :        25,
    "about" :      "I love to go rock bbb",
    "about_true" :      "I love to go rock bbb",
    "interests": [ "400001.OF", "music" ],
    "someday": "1989-02-09"
}
  1. 插入非设置字段数据
PUT /person/_doc/pre_add_end_delete_id
{
    "age" : 25
}
  1. 关闭索引
POST /person/_close
  1. 打开索引
POST /person/_open
  1. 查看索引设置
GET person/_settings
GET /person/_mapping
  1. 模糊搜索
POST /person/_search
{
  "query": {
    "match_phrase": {
      "first_name": "001"
    }
  }
}

POST /person/_search
{
  "query": {
    "match_phrase": {
      "first_name": "费水"
    }
  }
}

POST /person/_search
{
  "query": {
    "match_phrase": {
      "interests": "001"
    }
  }
}
