env: ElasticSearch 5.5.1
First there are two indexs in my elasticsearch and the only different of two index is the message field, the field's type of message in index1 is keyword, and in index2 is text.
To ensure that it is not affected by other fields,I remove the message field and compare before and after result:
Before remove message field:

after remove message field i got:

Obvious the message field takes up a lot of space,and the type of keyword take up much more than text,but I don't know why keyword take up much more size than text? so, is there anyone help me ?
Following is the index of index1's mapping info:
"mappings": {
"system": {
"dynamic": "true",
"_all": {
"enabled": false
},
"dynamic_date_formats": [
"yyyy-MM-dd HH:mm:ss.SSS"
],
"dynamic_templates": [
{
"geo2": {
"match": "*_geo",
"mapping": {
"type": "geo_point"
}
}
},
{
"strings2": {
"match_mapping_type": "string",
"mapping": {
"type": "keyword"
}
}
}
],
"numeric_detection": false,
"properties": {
"@agent_timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"@timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"Kafkaspeed": {
"type": "keyword"
},
"_index_name": {
"type": "keyword"
},
"count": {
"type": "long"
},
"datex": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"defaultWriteESspeed": {
"type": "double"
},
"filepathname": {
"type": "keyword"
},
"jsonmessage": {
"type": "text"
},
"key": {
"type": "keyword"
},
"logcount": {
"type": "long"
},
"loglevel": {
"type": "keyword"
},
"message": {
"type": "keyword"
},
"paredspeed": {
"type": "float"
},
"seccount": {
"type": "long"
},
"sn": {
"type": "long"
},
"sourceName": {
"type": "keyword"
},
"sourceip": {
"type": "keyword"
},
"sourcename": {
"type": "keyword"
},
"sourceport": {
"type": "long"
},
"sucesscount": {
"type": "long"
},
"time_str": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"timestamp": {
"type": "long"
},
"totalcount": {
"type": "long"
},
"uniqueid": {
"type": "keyword"
}
}
}
}
and settings info:
"settings": {
"index": {
"refresh_interval": "1s",
"number_of_shards": "3",
"translog": {
"flush_threshold_size": "1024mb",
"sync_interval": "60s",
"durability": "async"
},
"provided_name": "index1",
"creation_date": "1531389785215",
"analysis": {
"analyzer": {
"optionIK": {
"filter": [
"word_delimiter"
],
"type": "custom",
"tokenizer": "ik_max_word"
}
}
},
"number_of_replicas": "0",
"uuid": "zd8oVbwUQbys1UJ8hJZRmQ",
"version": {
"created": "5050099"
}
}
}
Following is the index of index2's mapping info:
"mappings": {
"system": {
"dynamic": "true",
"_all": {
"enabled": false
},
"dynamic_date_formats": [
"yyyy-MM-dd HH:mm:ss.SSS"
],
"dynamic_templates": [
{
"geo2": {
"match": "*_geo",
"mapping": {
"type": "geo_point"
}
}
},
{
"strings2": {
"match_mapping_type": "string",
"mapping": {
"type": "keyword"
}
}
}
],
"numeric_detection": false,
"properties": {
"@agent_timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"@timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"CommunicationReturnCode": {
"type": "keyword"
},
"Kafkaspeed": {
"type": "keyword"
},
"_index_name": {
"type": "keyword"
},
"action": {
"type": "keyword"
},
"count": {
"type": "long"
},
"datex": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"defaultWriteESspeed": {
"type": "double"
},
"filepathname": {
"type": "keyword"
},
"jsonmessage": {
"type": "text"
},
"key": {
"type": "keyword"
},
"logcount": {
"type": "long"
},
"loglevel": {
"type": "keyword"
},
"message": {
"type": "text"
},
"msgid": {
"type": "keyword"
},
"msgname": {
"type": "keyword"
},
"nodetype": {
"type": "keyword"
},
"orgid": {
"type": "keyword"
},
"orgname": {
"type": "keyword"
},
"paredspeed": {
"type": "float"
},
"processingState": {
"type": "keyword"
},
"processingStatecode": {
"type": "keyword"
},
"seccount": {
"type": "long"
},
"sn": {
"type": "long"
},
"sourceName": {
"type": "keyword"
},
"sourceip": {
"type": "keyword"
},
"sourcename": {
"type": "keyword"
},
"sourceport": {
"type": "long"
},
"sucesscount": {
"type": "long"
},
"thread": {
"type": "keyword"
},
"time_str": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"timestamp": {
"type": "long"
},
"totalcount": {
"type": "long"
},
"transDescription": {
"type": "keyword"
},
"transactionErrorCode": {
"type": "keyword"
},
"transactionTimeConsuming": {
"type": "keyword"
},
"transcode": {
"type": "keyword"
},
"uniqueid": {
"type": "keyword"
}
}
}
}
and setting info:
"settings": {
"index": {
"refresh_interval": "1s",
"number_of_shards": "2",
"translog": {
"flush_threshold_size": "1024mb",
"sync_interval": "60s",
"durability": "async"
},
"provided_name": "index2",
"creation_date": "1531467294314",
"analysis": {
"analyzer": {
"optionIK": {
"filter": [
"word_delimiter"
],
"type": "custom",
"tokenizer": "ik_max_word"
}
}
},
"number_of_replicas": "0",
"uuid": "yROU2MrMTzip4VXH_zWEXQ",
"version": {
"created": "5050099"
}
}
}
Following are one of the index's file structure of the two shards about the text type field:
and the keyword type field:

And you can believe that there are same number of documents in two folder, and the only difference of the field is the type of message field.
Could you explain it? Thank you so much!
In Elasticsearch keyword fields have doc_values enabled by default, while text fields does not. This means that on your keyword fields it will store the whole field in a column-oriented fashion, in order to be able to perform aggregations or sorting, without relying on fielddata.
Also, Once you tokenize a string, with stemming, lowercasing, etc, you can achieve much better compression.
You can try to disable doc_values on that field if you don't perform aggregations or sorting on it.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With