# Configuration of language processing (included by cf/sherlock)

######## Language processing basics #############################################

Lang {

# List of known languages (names according to RFC 1766) together with their aliases
# Format:	Language { Name=en; Alias=us uk au }
Language	en
Language	cs cz
Language	sk
Language	pl
Language	hu
Language	de
Language	nl
Language	fr
Language	es
Language	it
Language	ru

}

#ifdef CONFIG_LANG

######## Stemmers and synonymic dictionaries ####################################

Stemmers {

# List of stemming rules (languages, algorithm, optional parameters)
# Format: Stemmer	{ Languages="cs sk"; Algorithm=name; Params="(usually) filename" }
# Note: Languages="*" means all languages

# Porter's stemming algorithm for English
#Stemmer		en	porter
# A simple table-driven stemmer, tables are generated by stem-table-gen, see comments there
#Stemmer		"cs sk"	table	dict/cs/cs.tab
# A simple compressed dictionary-driven stemmer, tables are generated by stem-dict-gen, see comments there
#Stemmer		cs	dict	dict/cs/cs.dict

# List of synonymic dictionaries (languages, file)
# These are text files; each line corresponds to one synonymic class (not necessarily
# forming an equivalence), and the words on a line are separated by colons.
# Format: SynDict	{ Languages="cs sk"; Name=filename }
#SynDict		cs	dict/cs/synonyma

}

######## Computing the tables of the language detector ##########################

LangTables {

# Print debugging information?  (2 is very verbose.)
Trace		0
# Update the progress indicator after every N processed documents (0=off)
Progress	0

# Filter to be used by lang-tables (usually the same as Gatherer.Filter)
Filter		cf/filter

# Where to store the log about processed buckets
BucketStateFile	tmp/lang-buckets.log
# Where to store the sequence frequencies
FrequencyFile	tmp/lang-freq.log
# Where to store the generated config file
CoefficientFile	tmp/lang-coef.log
# Where to store the log about achieved thresholds
ThresholdFile	tmp/lang-threshold.log

# How large a part of the bucket file should be used for the training set (in per mille, i.e. thousandths)
TrainingRatio	666

# Only sequences of 1..MaxSequenceLength consecutive letters will be considered
# when building the tables.  Hard maximum is 4.
MaxSequenceLength	4

# For every language, NumberOfBestSeq most typical sequences are selected
NumberOfBestSeq		400

# If a tested text contains fewer typical sequences per character than
# the training set's value multiplied by FreqThreshold, we reject the detected language.
FreqThreshold	50%

# Construct the detection tables for the following languages:
# - AccentLanguages are languages with accented characters; two tables are
#   computed for each of them (accented and unaccented variant)
# - NoAccentLanguages are other languages
AccentLanguages		cs sk pl hu
NoAccentLanguages	en de nl fr es it ru

}

######## Language detector ######################################################

LangDetect {

# Language detection mode:
#   0	no automatic detection should be done
#   1	automatic detection should supplement language specified in document/metadata
#   2	only automatic detection should be used
Mode			1

# We refuse to detect the language of documents shorter than MinDocumentLength
# found sequences
MinDocumentLength	50

# Include the auto-generated configuration file of the language detector
IncludeTables		cf/lang-detect

}

#endif

