elasticsearch java client 自定义分词器
2023-07-05 本文已影响0人
- 如果直接生成索引而不做任何设置,字符串字段会按默认的动态映射生成(text 类型并附带 keyword 子字段),使用的是默认的 standard 分词器。该分词器不会把连续的字母和数字进一步拆分,因此无法按子串(例如 "400001.OF" 中的 "001")进行模糊匹配。
- 所以我们需要在生成索引之前对其进行设置
API 探索(两种配置方式)
- API.2 删除索引
- API.4 全局设置
- API.7 插入数据
- API.11 查看配置
- API.12 模糊搜索测试
- API.8 插入随意数据
- API.9 关闭索引
- API.5 配置
- API.6 设置
- API.10 打开索引
- API.3 删除随意数据
Java client
<!-- elasticsearch -->
目前测试下来,Java client 只支持先生成索引、后进行设置,大致的代码如下:
// Obtain the indices sub-client from the main Elasticsearch client.
ElasticsearchIndicesClient indicesClient = elasticsearchClient.indices();
// NOTE(review): the Fund class is opened here but never closed in this snippet,
// and the constructor used below (new Fund(deleteId)) is not shown.
class Fund {
private String innerCode;
// Id of the placeholder document handed to configAnalyzer.
String deleteId = "pre_add_end_delete";
// Name under which the custom analyzer is registered.
String analyzer = "my_ngram_analyzer";
// Tokenizer used by the custom analyzer.
String tokenizer = "ngram";
// Field names handed to configAnalyzer for the mapping update.
List<String> fields = List.of("secuCode", "secuAbbr");
// Run the whole "index placeholder -> close -> settings -> mapping -> open -> delete" sequence.
configAnalyzer(indicesClient, targetName, deleteId, new Fund(deleteId), analyzer, tokenizer, fields);
/**
 * Configures a custom analyzer on an index by first writing a placeholder document,
 * then closing the index, updating settings and mappings, reopening the index and
 * finally deleting the placeholder document.
 *
 * NOTE(review): several lines of this snippet appear to be missing (the lambdas and
 * the method itself are never closed), so the notes below describe only what is visible.
 *
 * @param indicesClient  client used for the close / putSettings / putMapping / open calls
 * @param targetName     name of the index being configured
 * @param deleteId       id of the placeholder document that is indexed first and deleted last
 * @param deleteDocument body of the placeholder document
 * @param analyzer       name under which the custom analyzer is registered
 * @param tokenizer      tokenizer assigned to the custom analyzer
 * @param fields         names of the fields that receive a text mapping with a keyword sub-field
 */
private void configAnalyzer(ElasticsearchIndicesClient indicesClient, String targetName, String deleteId, Object deleteDocument, String analyzer, String tokenizer, List<String> fields) {
// Index the placeholder document under deleteId in the target index.
elasticsearchClient.index(i -> i.index(targetName).id(deleteId).document(deleteDocument));
// Close the index first; otherwise the settings update is not allowed.
// NOTE(review): the visible lambda never sets the index name on the builder - confirm against the full source.
indicesClient.close(x -> {
elasticSearchService.printDsl(CloseIndexRequest.class, CloseIndexRequest.Builder.class, x);
return x;
// Register the custom analyzer.
indicesClient.putSettings(x -> {
x.index(targetName).settings(y -> y
.analysis(z -> z.analyzer(analyzer, a -> a.custom(b -> b.tokenizer(tokenizer)))));
elasticSearchService.printDsl(PutIndicesSettingsRequest.class, PutIndicesSettingsRequest.Builder.class, x);
return x;
// Apply the custom analyzer to the fields.
// NOTE(review): the visible lines only add a keyword sub-field (ignore_above 256) to each text
// property; neither the index name nor the analyzer parameter is referenced here - confirm.
indicesClient.putMapping(x -> {
fields.forEach(f -> {
x.properties(f, Property.of(y -> y.text(z -> z
.fields(Property.Kind.Keyword.jsonValue(),a -> a.keyword(b -> b.ignoreAbove(256))))));
elasticSearchService.printDsl(PutMappingRequest.class, PutMappingRequest.Builder.class, x);
return x;
// Reopen the index; otherwise data cannot be modified.
indicesClient.open(x -> {
elasticSearchService.printDsl(OpenRequest.class, OpenRequest.Builder.class, x);
return x;
// Finally, delete the placeholder document that was indexed at the start.
elasticsearchClient.delete(i -> i.index(targetName).id(deleteId));
/**
 * Connection and sizing properties for the Elasticsearch client, declared with the
 * configuration prefix "elastic-search".
 *
 * NOTE(review): accessors such as getHostname() / getPort() / getLimitSize() are used by
 * buildElasticsearchClient but are not visible in this snippet - presumably generated; confirm.
 */
@ConfigurationProperties(prefix = "elastic-search")
public class ElasticSearchProperty implements Serializable {
// Host name of the Elasticsearch node.
private String hostname;
// Port of the Elasticsearch node.
private int port;
// Connect timeout; unit not visible here - TODO confirm.
private int connectTimeout;
// Socket timeout; unit not visible here - TODO confirm.
private int socketTimeout;
// Defaults to 9999; presumably a default result size for fund queries - verify against callers.
private int defaultFundSize = 9999;
// Defaults to 9999; presumably a default result size for manager queries - verify against callers.
private int defaultManagerSize = 9999;
// Defaults to 200 * 1024 * 1024; passed to the heap-buffered response consumer factory as its buffer limit.
private int limitSize = 200 * 1024 * 1024;
/**
 * Builds an ElasticsearchClient from the given connection properties.
 *
 * The visible lines create the HttpHost from hostname and port, define an HTTP client
 * callback (credentials provider, JSON Content-Type default header, and a response
 * interceptor that adds the "X-Elastic-Product" header), configure a Jackson-based JSON
 * mapper, and attach request options whose response buffer limit comes from
 * property.getLimitSize().
 *
 * NOTE(review): parts of this snippet are missing - the callback lambda has no receiver
 * before its first ".set..." call and the RestClient builder chain is not terminated.
 * The connectTimeout / socketTimeout properties are not used in the lines shown.
 *
 * @param property connection settings (host name, port, response buffer limit)
 * @return a client built on the REST transport with the customized mapper and request options
 */
public ElasticsearchClient buildElasticsearchClient(ElasticSearchProperty property) {
HttpHost httpHost = new HttpHost(property.getHostname(), property.getPort());
// https://stackoverflow.com/questions/71142680/co-elastic-clients-transport-transportexception-es-search-missing-x-elastic
// The default headers the RestClientBuilder allows you to specify are the request headers, not the response headers. The error you are getting is because older Elasticsearch [server] versions do not include the X-Elastic-Product=Elasticsearch header in any of the API responses, but the recent distributions do (7.14+?), so the newer versions of elasticsearch-java (i.e. client) expects them.
RestClientBuilder.HttpClientConfigCallback httpClientConfigCallback = httpClientBuilder ->
// java.io.IOException: Connection reset by peer
// https://cloud.tencent.com/developer/article/1943055
.setDefaultCredentialsProvider(new BasicCredentialsProvider())
.setDefaultHeaders(List.of(new BasicHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString())))
// Add the product header to every response so the client's product check passes (see the note above).
.addInterceptorLast((HttpResponseInterceptor) (response, context) -> response.addHeader("X-Elastic-Product", "Elasticsearch"));
// NOTE(review): the rest of this builder chain is not shown in the snippet.
var restClient = RestClient.builder(httpHost)
JacksonJsonpMapper mapper = new JacksonJsonpMapper();
ObjectMapper objectMapper = mapper.objectMapper();
// Ignore JSON properties that have no matching field instead of failing.
objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
// Accept a single JSON value where an array is expected.
objectMapper.configure(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY, true);
// Register java.time support (LocalDate / LocalDateTime).
objectMapper.registerModule(new JavaTimeModule());
// default:/index_name/_search?typed_keys=true => /index_name/_search?typed_keys=false
// Elasticsearch search requests accept a typed_key parameter that allow returning type information along with the name in aggregation and suggestion results (see the aggregations documentation for additional details).
// However, in some use cases serializing objects in the typed_keys format may not be desirable, for example when the Java API Client is used in an application that acts as a front-end to other services that expect the default format for aggregations and suggestions.
// You can disable typed_keys serialization by setting the JsonpMapperFeatures.SERIALIZE_TYPED_KEYS attribute to false on your mapper object:
// mapper.withAttribute(JsonpMapperFeatures.SERIALIZE_TYPED_KEYS, false);
// result:"\"aggregations\":{\"avg#price\":{\"value\":3.14}}}") VS "\"aggregations\":{\"price\":{\"value\":3.14}}}"
// avg returned
ElasticsearchTransport transport = new RestClientTransport(restClient, mapper);
// bufferLimit
RequestOptions.Builder requestOptionsBuilder = RequestOptions.DEFAULT.toBuilder();
// Use a heap-buffered response consumer whose buffer limit is taken from the configured limitSize.
requestOptionsBuilder.setHttpAsyncResponseConsumerFactory(new HttpAsyncResponseConsumerFactory.HeapBufferedResponseConsumerFactory(property.getLimitSize()));
RestClientOptions myOptions = new RestClientOptions(requestOptionsBuilder.build());
return new ElasticsearchClient(transport, myOptions);
- 查看数据
get /person/_search
- 删除索引
delete /person
- 删除其中一条数据
delete /person/_doc/pre_add_end_delete_id
- 索引生成前对索引进行设置(自定义分词器+对字段进行设置)
PUT /person
"settings": {
"index": {
"max_result_window": 50000
"analysis": {
"analyzer": {
"tokenizer": "my_tokenizer",
"filter": ["my_stop_token_filter"]
"char_filter": {
"mappings":["* => _","= => ~"]
"tokenizer": {
"filter": {
"stopwords": ["is","a","the"]
"mappings": {
"properties": {
"first_name" : {
"type": "text",
"analyzer": "my_ngram_analyzer"
"interests" : {
"type" : "text",
"analyzer": "my_ngram_analyzer",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
- 先有索引再进行设置(简化版本)
PUT /person/_settings
{
  "analysis": {
    "analyzer": {
      "my_ngram_analyzer": {
        "tokenizer": "ngram",
        "filter": []
      }
    }
  }
}
- 先有索引再进行设置(对字段设置)
POST /person/_mapping
{
  "properties": {
    "first_name" : {
      "type": "text",
      "analyzer": "my_ngram_analyzer"
    },
    "interests" : {
      "type" : "text",
      "analyzer": "my_ngram_analyzer",
      "fields" : {
        "keyword" : {
          "type" : "keyword",
          "ignore_above" : 256
        }
      }
    }
  }
}
- 插入测试数据
PUT /person/_doc/1
{
  "first_name" : "400001.OF",
  "last_name" : "Smith",
  "age" : 25,
  "about" : "I love to go rock bbb",
  "about_true" : "I love to go rock bbb",
  "interests": [ "400001.OF", "music" ],
  "someday": "1989-02-09"
}
PUT /person/_doc/2
{
  "first_name" : "付水电费水电费",
  "last_name" : "Smith",
  "age" : 25,
  "about" : "I love to go rock bbb",
  "about_true" : "I love to go rock bbb",
  "interests": [ "400001.OF", "music" ],
  "someday": "1989-02-09"
}
- 插入非设置字段数据
PUT /person/_doc/pre_add_end_delete_id
{
  "age" : 25
}
- 关闭索引
POST /person/_close
- 打开索引
POST /person/_open
- 查看索引设置
GET person/_settings
GET /person/_mapping
- 模糊搜索
POST /person/_search
{
  "query": {
    "match_phrase": {
      "first_name": "001"
    }
  }
}
POST /person/_search
{
  "query": {
    "match_phrase": {
      "first_name": "费水"
    }
  }
}
POST /person/_search
{
  "query": {
    "match_phrase": {
      "interests": "001"
    }
  }
}