Elasticsearch Analysis
2019-03-28
歌哥居士
POST _analyze
{
  "tokenizer": "standard",
  "filter": ["lowercase"],
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
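The _analyze API also accepts a char_filter array, so the full chain described in the comment below (char filters, then tokenizer, then token filters) can be assembled by hand. A minimal sketch, assuming the built-in html_strip char filter and an HTML-wrapped variant of the sample text:
POST _analyze
{
  "char_filter": ["html_strip"],
  "tokenizer": "standard",
  "filter": ["lowercase"],
  "text": "<p>The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.</p>"
}
# html_strip removes the <p> tags before the standard tokenizer runs,
# so the resulting tokens match the request above.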
# analyzer = char_filter + tokenizer + token filter
# standard analyzer = {
#   standard tokenizer
#   +
#   standard token filter (a no-op)
#   lower case token filter
#   stop token filter (disabled by default)
# }
POST _analyze
{
  "analyzer": "standard",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
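# Expected tokens (roughly): the, 2, quick, brown, foxes, jumped, over, the, lazy, dog's, bone
# (splits on word boundaries, lowercases, and drops most punctuation; "Brown-Foxes" becomes two tokens)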
# simple analyzer = {
# lower case tokenizer
# }
POST _analyze
{
  "analyzer": "simple",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
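# Expected tokens (roughly): the, quick, brown, foxes, jumped, over, the, lazy, dog, s, bone
# (the lowercase tokenizer splits on anything that is not a letter, so "2" is dropped and "dog's" splits into "dog", "s")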
# whitespace analyzer = {
# whitespace tokenizer
# }
POST _analyze
{
  "analyzer": "whitespace",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
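# Expected tokens (roughly): The, 2, QUICK, Brown-Foxes, jumped, over, the, lazy, dog's, bone.
# (splits on whitespace only; case and punctuation are left untouched)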
# stop analyzer = {
# lower case tokenizer
# +
# stop filter
# }
POST _analyze
{
  "analyzer": "stop",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
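# Expected tokens (roughly): quick, brown, foxes, jumped, over, lazy, dog, s, bone
# (like the simple analyzer, but English stop words such as "the" are removed)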
# keyword analyzer = {
# keyword tokenizer
# }
POST _analyze
{
  "analyzer": "keyword",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
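# Expected output (roughly): a single token containing the entire input string unchanged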
# pattern analyzer = {
# pattern tokenizer
# +
# lower case filter
# stop filter (disabled by default)
# }
# The default pattern is \W+ (split on non-word characters)
POST _analyze
{
  "analyzer": "pattern",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
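# Expected tokens (roughly): the, 2, quick, brown, foxes, jumped, over, the, lazy, dog, s, bone
# (splits on the default \W+ pattern, then lowercases; stop words are kept because the stop filter is disabled)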
Chinese Analysis
- elasticsearch-analysis-ik (see the sketch after this list)
- jieba
- Hanlp
- THULAC
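A minimal sketch of trying the IK plugin, assuming elasticsearch-analysis-ik has already been installed with bin/elasticsearch-plugin install (matching your ES version) and the node restarted; ik_max_word and ik_smart are the two analyzers the plugin registers, and the sample sentence is only illustrative:
POST _analyze
{
  "analyzer": "ik_max_word",
  "text": "中华人民共和国国歌"
}
# ik_max_word segments the text as finely as possible; ik_smart returns a coarser segmentation.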
Custom Analyzers
PUT test_index2
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "type": "custom",
          "char_filter": ["emoticons"],
          "tokenizer": "punctuation",
          "filter": ["lowercase", "english_stop"]
        }
      },
      "char_filter": {
        "emoticons": {
          "type": "mapping",
          "mappings": [
            ":) => _happy_",
            ":( => _sad_"
          ]
        }
      },
      "tokenizer": {
        "punctuation": {
          "type": "pattern",
          "pattern": "[.,!?]"
        }
      },
      "filter": {
        "english_stop": {
          "type": "stop",
          "stopwords": "_english_"
        }
      }
    }
  }
}
POST test_index2/_analyze
{
  "analyzer": "my_custom_analyzer",
  "text": "I'm a :) person, and you"
}
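# Expected tokens (roughly): "i'm a _happy_ person" and "and you".
# The emoticons char filter rewrites :) to _happy_ before tokenization, the punctuation
# tokenizer only splits on , . ! ?, and neither resulting token is an English stop word.
# Note: the ES reference version of this example uses the pattern "[ .,!?]" (with a space),
# which also splits on spaces and instead yields roughly: i'm, _happy_, person, you
# (english_stop then drops "a" and "and").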