(from github.com/zackhorvath)
Hi Josh,
For our environment, we created our own analyzer and filter set within /usr/share/fess/app/WEB-INF/classes/fess_indices/fess.json, and then replaced the mappings inside /usr/share/fess/app/WEB-INF/classes/fess_indices/fess/doc.json.
Our filters,
"fh_english_stemmer": {
"type": "stemmer",
"language": "english"
},
"fh_english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
},
"fh_english_stop": {
"type": "stop",
"stopwords": "_english_"
}
And our analyzer,
"fh_standard": {
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding",
"fh_english_stop",
"fh_english_possessive_stemmer",
"fh_english_stemmer"
]
}
Together, these filters and this analyzer leverage the default built-in Elasticsearch functionality, and allow us to customize our text analysis a little better without needing to directly modify the core Fess configuration.
Our mapping doc.json now looks like the following:
{
"doc": {
"_source": {
"enabled": true
},
"dynamic_templates": [
{
"lang_en": {
"match": "*_en",
"mapping": {
"type": "text",
"analyzer": "fh_standard"
}
}
}
],
"properties": {
"anchor": {
"type": "keyword"
},
"boost": {
"type": "float"
},
"click_count": {
"type": "long"
},
"config_id": {
"type": "keyword"
},
"important_content": {
"type": "text",
"analyzer": "fh_standard"
},
"content": {
"type": "text",
"analyzer": "fh_standard",
"term_vector": "with_positions_offsets"
},
"content_minhash": {
"type": "minhash",
"minhash_analyzer": "minhash_analyzer",
"copy_bits_to": "content_minhash_bits"
},
"content_minhash_bits": {
"type": "keyword"
},
"content_length": {
"type": "long"
},
"created": {
"type": "date",
"format": "date_optional_time"
},
"timestamp": {
"type": "date",
"format": "date_optional_time"
},
"expires": {
"type": "date",
"format": "date_optional_time"
},
"digest": {
"type": "text",
"analyzer": "fh_standard"
},
"doc_id": {
"type": "keyword"
},
"favorite_count": {
"type": "long"
},
"filename": {
"type": "keyword"
},
"filetype": {
"type": "keyword"
},
"host": {
"type": "keyword"
},
"lang": {
"type": "keyword"
},
"last_modified": {
"type": "date",
"format": "date_optional_time"
},
"location": {
"type": "geo_point"
},
"mimetype": {
"type": "keyword"
},
"parent_id": {
"type": "keyword"
},
"role": {
"type": "keyword"
},
"label": {
"type": "keyword"
},
"virtual_host": {
"type": "keyword"
},
"segment": {
"type": "keyword"
},
"site": {
"type": "keyword"
},
"meta_template": {
"type": "keyword"
},
"meta_keywords": {
"type": "keyword"
},
"meta_event_date": {
"type": "keyword"
},
"meta_published_time": {
"type": "date",
"format": "date_optional_time"
},
"meta_last_mod_date": {
"type": "date",
"format": "epoch_second"
},
"title": {
"type": "text",
"analyzer": "fh_standard",
"term_vector": "with_positions_offsets"
},
"thumbnail": {
"type": "keyword"
},
"url": {
"type": "keyword"
}
}
}
}
We’ve got a lot of weirdness going on from multiple data sources, so it made the most sense for our organization to break away from the default Fess analyzers. I wouldn’t readily recommend this approach for smaller orgs or people looking for an out-of-the-box solution! We are also not taking advantage of the Fess API – we built our own solution for search queries – and I am unsure what effect these changes have on other Fess functionality, such as suggest.