elasticsearch java client 自定义分词器
2023-07-05 本文已影响0人
lz做过前端
背景说明
- 如果直接生成索引,不做任何设置,字符串字段会被动态映射为 text 类型(附带一个 keyword 子字段),并使用默认的 standard 分词器。该分词器不会把连续的字母和数字(例如 400001)再切分成更小的片段,因此无法按子串做模糊搜索。
- 所以我们需要在生成索引之前对其进行设置
API 探索(两种配置方式)
生成索引之前设置
- API.2 删除索引
- API.4 全局设置
- API.7 插入数据
- API.11 查看配置
- API.12 模糊搜索测试
先生成索引后设置
- API.2 删除索引
- API.8 插入随意数据
- API.9 关闭索引
- API.5 配置
- API.6 设置
- API.10 打开索引
- API.3 删除随意数据
- API.11 查看配置
- API.12 模糊搜索测试
Java client
<!-- Version property referenced by the elasticsearch-rest-client dependency below. -->
<properties>
<elasticsearch.new.version>8.7.1</elasticsearch.new.version>
</properties>
<!-- elasticsearch -->
<!-- Java API Client. NOTE(review): no version element is declared for this artifact; presumably it is supplied by a parent POM or dependencyManagement. Confirm it is compatible with ${elasticsearch.new.version}. -->
<dependency>
<groupId>co.elastic.clients</groupId>
<artifactId>elasticsearch-java</artifactId>
</dependency>
<!-- Low-level REST client, pinned to the version property declared above. -->
<dependency>
<groupId>org.elasticsearch.client</groupId>
<artifactId>elasticsearch-rest-client</artifactId>
<version>${elasticsearch.new.version}</version>
</dependency>
目前测试下来 Java client 只支持 先生成索引后设置,大致的代码如下:
// Minimal document type used only for the placeholder document handed to configAnalyzer.
@Data
@AllArgsConstructor
class Fund {
    private String innerCode;
}

// Id of the placeholder document; the same value is also used as its innerCode.
String placeholderId = "pre_add_end_delete";
// Name of the custom analyzer and the tokenizer it is built on.
String analyzerName = "my_ngram_analyzer";
String tokenizerName = "ngram";
// Fields that should be analyzed with the custom analyzer.
List<String> analyzedFields = List.of("secuCode", "secuAbbr");

ElasticsearchIndicesClient indices = elasticsearchClient.indices();
configAnalyzer(indices, targetName, placeholderId, new Fund(placeholderId), analyzerName, tokenizerName, analyzedFields);
/**
 * Registers a custom analyzer on the index {@code targetName} and assigns it to the given fields.
 *
 * <p>Sequence: index a placeholder document, close the index, put the analysis settings,
 * put the field mappings, reopen the index, delete the placeholder document. The index is
 * reopened in a {@code finally} block so that a failure while updating the settings or the
 * mappings does not leave it closed.
 *
 * @param indicesClient  indices client used for the close / putSettings / putMapping / open calls
 * @param targetName     name of the index to configure
 * @param deleteId       id of the placeholder document that is indexed first and deleted last
 * @param deleteDocument body of the placeholder document
 * @param analyzer       name under which the custom analyzer is registered
 * @param tokenizer      name of the tokenizer the custom analyzer uses
 * @param fields         fields mapped as text with this analyzer plus a keyword sub-field
 */
@SneakyThrows
private void configAnalyzer(ElasticsearchIndicesClient indicesClient, String targetName, String deleteId, Object deleteDocument, String analyzer, String tokenizer, List<String> fields) {
    // Index a placeholder document first; it is removed again at the end of this method.
    // NOTE(review): presumably this is what brings the index into existence - confirm auto-create is enabled.
    elasticsearchClient.index(i -> i.index(targetName).id(deleteId).document(deleteDocument));
    // Close the index, otherwise the settings update below is not allowed.
    indicesClient.close(x -> {
        x.index(List.of(targetName));
        elasticSearchService.printDsl(CloseIndexRequest.class, CloseIndexRequest.Builder.class, x);
        return x;
    });
    try {
        // Register the custom analyzer (together with the max result window).
        indicesClient.putSettings(x -> {
            x.index(targetName).settings(y -> y
                    .maxResultWindow(elasticSearchProperty.getDefaultFundSize())
                    .analysis(z -> z.analyzer(analyzer, a -> a.custom(b -> b.tokenizer(tokenizer)))));
            elasticSearchService.printDsl(PutIndicesSettingsRequest.class, PutIndicesSettingsRequest.Builder.class, x);
            return x;
        });
        // Attach the custom analyzer to each requested field.
        indicesClient.putMapping(x -> {
            x.index(List.of(targetName));
            fields.forEach(f -> {
                x.properties(f, Property.of(y -> y.text(z -> z
                        .analyzer(analyzer)
                        .fields(Property.Kind.Keyword.jsonValue(), a -> a.keyword(b -> b.ignoreAbove(256))))));
            });
            elasticSearchService.printDsl(PutMappingRequest.class, PutMappingRequest.Builder.class, x);
            return x;
        });
    } finally {
        // Always reopen: a closed index does not allow data to be modified, and without this
        // a failed settings/mapping update would leave the index closed.
        indicesClient.open(x -> {
            x.index(List.of(targetName));
            elasticSearchService.printDsl(OpenRequest.class, OpenRequest.Builder.class, x);
            return x;
        });
    }
    // Finally remove the placeholder document indexed at the start.
    elasticsearchClient.delete(i -> i.index(targetName).id(deleteId));
}
客户端配置
/**
 * Settings for the Elasticsearch client, bound from configuration entries under the
 * {@code elastic-search} prefix. Accessors are generated by Lombok's {@code @Data}.
 */
@Data
@Component
@ConfigurationProperties(prefix = "elastic-search")
public class ElasticSearchProperty implements Serializable {
    /** Default used for both result-size settings. */
    private static final int DEFAULT_RESULT_SIZE = 9999;
    /** Number of bytes in one mebibyte. */
    private static final int BYTES_PER_MIB = 1024 * 1024;

    /** Host name of the Elasticsearch node. */
    private String hostname;
    /** Port of the Elasticsearch node. */
    private int port;
    /** Connect timeout handed to the HTTP client's request config. */
    private int connectTimeout;
    /** Socket timeout handed to the HTTP client's request config. */
    private int socketTimeout;
    /** Result size for fund queries. */
    private int defaultFundSize = DEFAULT_RESULT_SIZE;
    /** Result size for manager queries. */
    private int defaultManagerSize = DEFAULT_RESULT_SIZE;
    /** Size limit in bytes (200 MiB by default). */
    private int limitSize = 200 * BYTES_PER_MIB;
}
/**
 * Creates the {@link ElasticsearchClient} bean from the given connection properties.
 *
 * @param property host, port, timeouts and response buffer limit
 * @return client backed by a low-level REST client and a Jackson JSON mapper
 */
@Bean
public ElasticsearchClient buildElasticsearchClient(ElasticSearchProperty property) {
    // JSON mapping: ignore unknown properties, accept a single value where an array is expected,
    // and register the JavaTime module so LocalDate / LocalDateTime can be parsed.
    JacksonJsonpMapper jsonpMapper = new JacksonJsonpMapper();
    jsonpMapper.objectMapper()
            .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
            .configure(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY, true)
            .registerModule(new JavaTimeModule());
    // typed_keys note: by default search requests go out as /index_name/_search?typed_keys=true, so
    // aggregation and suggestion results carry type information in their names, e.g.
    // "aggregations":{"avg#price":{"value":3.14}} instead of "aggregations":{"price":{"value":3.14}}.
    // When the application fronts services that expect the plain format, typed_keys serialization
    // can be switched off by setting JsonpMapperFeatures.SERIALIZE_TYPED_KEYS to false on the mapper
    // via withAttribute(...). It is left at the default here.

    RestClientBuilder.HttpClientConfigCallback configureHttpClient = asyncClientBuilder -> {
        RequestConfig timeouts = RequestConfig.custom()
                .setConnectTimeout(property.getConnectTimeout())
                .setSocketTimeout(property.getSocketTimeout())
                .build();
        // Keep-alive addresses "java.io.IOException: Connection reset by peer"
        // (https://cloud.tencent.com/developer/article/1943055).
        IOReactorConfig keepAliveReactor = IOReactorConfig.custom()
                .setSoKeepAlive(true)
                .build();
        BasicHeader jsonContentType = new BasicHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
        // https://stackoverflow.com/questions/71142680/co-elastic-clients-transport-transportexception-es-search-missing-x-elastic
        // Default headers on the RestClientBuilder are request headers, not response headers. Older
        // Elasticsearch servers do not send X-Elastic-Product=Elasticsearch in their responses
        // (recent distributions do, 7.14+?), while newer elasticsearch-java clients expect it,
        // so the header is added to every response here.
        HttpResponseInterceptor productHeaderInjector =
                (response, context) -> response.addHeader("X-Elastic-Product", "Elasticsearch");
        return asyncClientBuilder
                .setDefaultRequestConfig(timeouts)
                .setDefaultIOReactorConfig(keepAliveReactor)
                .setDefaultCredentialsProvider(new BasicCredentialsProvider())
                .setDefaultHeaders(List.of(jsonContentType))
                .addInterceptorLast(productHeaderInjector);
    };

    HttpHost node = new HttpHost(property.getHostname(), property.getPort());
    RestClient lowLevelClient = RestClient.builder(node)
            .setHttpClientConfigCallback(configureHttpClient)
            .build();
    ElasticsearchTransport transport = new RestClientTransport(lowLevelClient, jsonpMapper);

    // Buffer limit: heap-buffered response consumer sized from the configured limit.
    RequestOptions.Builder requestOptions = RequestOptions.DEFAULT.toBuilder();
    requestOptions.setHttpAsyncResponseConsumerFactory(
            new HttpAsyncResponseConsumerFactory.HeapBufferedResponseConsumerFactory(property.getLimitSize()));
    RestClientOptions transportOptions = new RestClientOptions(requestOptions.build());

    return new ElasticsearchClient(transport, transportOptions);
}
API
- 查看数据
get /person/_search
- 删除索引
delete /person
- 删除其中一条数据
delete /person/_doc/pre_add_end_delete_id
- 索引生成前对索引进行设置(自定义分词器+对字段进行设置)
PUT /person
{
"settings": {
"index": {
"max_result_window": 50000
},
"analysis": {
"analyzer": {
"my_ngram_analyzer":{
"type":"custom",
"char_filter":[
"my_html_strip",
"my_punctuation_mapping"
],
"tokenizer": "my_tokenizer",
"filter": ["my_stop_token_filter"]
}
},
"char_filter": {
"my_punctuation_mapping":{
"type":"mapping",
"mappings":["* => _","= => ~"]
},
"my_html_strip":{
"type":"html_strip"
}
},
"tokenizer": {
"my_tokenizer":{
"type":"ngram"
}
},
"filter": {
"my_stop_token_filter":{
"type":"stop",
"ignore_case":true,
"stopwords": ["is","a","the"]
}
}
}
},
"mappings": {
"properties": {
"first_name" : {
"type": "text",
"analyzer": "my_ngram_analyzer"
},
"interests" : {
"type" : "text",
"analyzer": "my_ngram_analyzer",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}
}
- 先有索引再进行设置(简化版本)
PUT /person/_settings
{
"analysis": {
"analyzer": {
"my_ngram_analyzer":{
"type":"custom",
"char_filter":[],
"tokenizer": "ngram",
"filter": []
}
}
}
}
- 先有索引再进行设置(对字段设置)
POST /person/_mapping
{
"properties": {
"first_name" : {
"type": "text",
"analyzer": "my_ngram_analyzer"
},
"interests" : {
"type" : "text",
"analyzer": "my_ngram_analyzer",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}
- 插入测试数据
PUT /person/_doc/1
{
"first_name" : "400001.OF",
"last_name" : "Smith",
"age" : 25,
"about" : "I love to go rock bbb",
"about_true" : "I love to go rock bbb",
"interests": [ "400001.OF", "music" ],
"someday": "1989-02-09"
}
PUT /person/_doc/2
{
"first_name" : "付水电费水电费",
"last_name" : "Smith",
"age" : 25,
"about" : "I love to go rock bbb",
"about_true" : "I love to go rock bbb",
"interests": [ "400001.OF", "music" ],
"someday": "1989-02-09"
}
- 插入非设置字段数据
PUT /person/_doc/pre_add_end_delete_id
{
"age" : 25
}
- 关闭索引
POST /person/_close
- 打开索引
POST /person/_open
- 查看索引设置
GET person/_settings
GET /person/_mapping
- 模糊搜索
POST /person/_search
{
"query": {
"match_phrase": {
"first_name": "001"
}
}
}
POST /person/_search
{
"query": {
"match_phrase": {
"first_name": "费水"
}
}
}
POST /person/_search
{
"query": {
"match_phrase": {
"interests": "001"
}
}
}