# Configuration of the indexer (included by cf/sherlock)

######## General parameters #####################################################

Indexer {

# Directory we build the index in
Directory		index

# Which database should we index:
# bucket:<file>		a bucket file as generated by gatherd or gbatch
# text:<file>		textual input, buckets separated by a blank line
# raw:<file>		a sequence of raw buckets, each starting with 32-bit BUCKET_TYPE_xxx
Source			bucket:db/objects

# Verbosity level of general tracing messages (some modules have their own
# tracing switches for specific purposes)
Trace			0

# Filter to be used by the indexer (usually the same as Gatherer.Filter)
Filter			cf/filter

# Default weight of a document before all bonuses and penalties are applied
DefaultWeight		64

# Names of files generated by the indexer, relative to Directory
# (see doc/indexer for who is producing and reading what)
Incremental		incremental
Parameters		parameters
Attributes		attributes
Notes			notes
Labels			labels
Sites			sites
URLList			url-list
LabelsByID		labels-by-id
Merges			merges

#ifndef CONFIG_BARE
Fingerprints		fingerprints
Checksums		checksums
Links			links-by-url
RefTexts		ref-texts
LinkGraph		link-graph
LinkGraphIndex		link-graph-index
Signatures		signatures
Matches			matches
NotesNew		notes-new
Keywords		keywords
FeedbackGatherer	gatherer-feedback
#endif

WordIndex		word-index
StringIndex		string-index
Lexicon			lexicon
LexRaw			lexicon-raw
LexOrdered		lexicon-ordered
LexWords		lexicon-words
Stems			stems
StemsOrdered		stems-ordered
References		references
StringMap		string-map
StringHash		string-hash
Cards			cards
CardAttributes		card-attrs

# If you want to use online merging of multiple indices by the search server,
# you need to generate the card-prints file as well.
#CardPrints		card-prints

# Indices can be also merged off-line: you can use the black-gen utility on any index
# to generate a blacklist listing cards overridden by cards in other indices, distribute the
# blacklist to search servers and run sherlockd --merge there. This is not handled by the
# default indexer scripts, you have to do it manually.
Blacklist		blacklist

# Minimum size of a document to consider it for detection of duplicates
# (an analogue of Gather.MinSummedSize)
MinSummedSize		64

# Processing of attributes -- here you can define how the indexer processes
# individual object attributes. A complete list of all attributes can be found in doc/objects.
# If you want to include a whole sub-object, use `(s)' syntax. Spaces are ignored.

# These attributes are extracted from the objects and stored into per-URL blocks
# of cards, or into per-redirect sub-blocks if they come via a redirect.
# List format: (Label|OverrideLabel|OverrideBody|Card|Link)Attrs	{ Attr=letter }
LabelAttrs		D E J L T V c k s

# In some cases, object attributes can be overridden by the scanner and analysers
# called from there. Such attributes have to be declared here to propagate them
# correctly to either the per-URL block or to the object body.
#OverrideLabelAttrs	8 9
#OverrideBodyAttrs	8 9

# Attributes copied to the index cards (from object body; those in per-URL blocks
# are included automatically and the "X" attribute likewise).
CardAttrs		G K M N W b e i t q u y z .

# URL attributes used to generate edges of the link graph
LinkAttrs		A F I R Y a d

# Maximum vertex degree in the link graph (extraneous edges ignored; max. 65535)
MaxGraphDegree		65535

# Link attributes we extract reference texts for (default: none=reftexts disabled)
# List format: RefLinkTypes	{ Attr=letter }
RefLinkTypes		I R

# Maximum length of a reference text
RefMaxLength		80

# Minimum number of alphanumeric characters in a reference text
RefMinLength		3

# Maximum number of reference texts for a single card
RefMaxCount		10

# The lexfreq utility can be used to dump lexicon sorted by word frequencies to:
LexByFreq		lexicon-by-freq

# Enable debugging dump of lexicon equivalence classes to the following file:
LexClasses		lexicon-classes

# Average bucket size for the string hash (in entries)
StringAvgBucket		120

# Buffer size for various files used by the indexer (0=use mmap)
FileBufSize		64K

# Progress indicator: update it every Progress documents processed (0=off)
# and show on screen or in the process status line. See also bin/indexer -v.
Progress		100
ProgressScreen		0
ProgressStatusLine	1

# Sorters should delete their source files as soon as possible to conserve space
SortDeleteSrc		0

# Maximum number of objects we're willing to process
# (useful for testing indexer on a subset of documents; default: unlimited)
#MaxObjects		10000

# Enable conversion of frameset documents to redirects to one of the
# frames instead of indexing the noframe version. (Useful when your documents
# contain bogus noframe texts like "the stone age is already over" :) )
FramesetToRedir		1

# MAGIC: Make the second stage of the indexer (mklex, chewer, ...) assume that its input
# consists of already preprocessed cards. You probably don't want to use this.
RawStage2Input		0

# The index can be split to several subindices based on filetype mask (4-bit filetype)
# and id mask (bottom 4 bits of URL hash or of site hash if site compression is enabled).
# List format: SubIndex	{ Name=string; TypeMask=int; IDMask=int }
#			name	types	ids
#SubIndex		main1	0x00ff	0x5555
#SubIndex		main2	0x00ff	0xaaaa
#SubIndex		img	0xff00	0xffff

}

######## Approximative matcher of similar documents #############################

Matcher {

# The number of signatures computed for a document.  The bigger the number
# chosen, the more memory is used and the more time is spent.  However, if
# the number is too small, the approximation algorithm is not accurate.  If it
# is set to zero, the algorithm is disabled.
Signatures		50

# The signatures are computed from a cyclic hash of Context consecutive
# words.  Context must be neither too big (otherwise a small change in a
# word influences broad environs), nor too small (otherwise the cyclic hash
# does not involve the word order very well).
Context			4

# Only documents longer than MinWords words are merged by the approximative
# matcher.  This is convenient because a small change in a short text (about 20
# words) can be significant, even though the signatures are almost equal.  The
# default value is 0.
MinWords		20

# The documents are considered similar if at least Threshold hashes of
# their signatures are equal.  The ratio EqualHashes/Signatures is a
# good approximation of the fraction of the document contents that is equal.
Threshold		47

# The signatures are sorted Passes times on a hash chosen at random.  Each
# time, the index is scanned linearly and for each value of the key hash, similar
# documents are detected.  The more passes, the slower and the more precise.
Passes			5

# Blocksize (in documents) of the matching pass.  Sorted signatures are
# consecutively read in blocks not longer than Block and processed
# _quadratically_ in the memory.
Block			64

}

######## Merging of identical / similar documents ###############################

Merger {

# If defined, classes larger than GiantDocuments (counting duplicates)
# or than GiantRedirects (counting also redirects) are marked giant
GiantDocuments		100
GiantRedirects		3000

}

######## {URL,File} keywords processor ##########################################

Keywords {

# Level of debugging outputs: 1 explains changes, 2 is for programmers
Trace			0

# Extraction of words from URL's (a list of regex substitution rules + weight [0 to 3])
# (lexmapper is run on the result)
# List format: (URL|File)WordPattern	{ Pattern=string; Replace=string; Weight=[0123]; Hardcoded=[01] }
URLWordPattern		{ Hardcoded 1 }
FileWordPattern		{ Hardcoded 1 }
#The hardcoded rules are equivalent to the following rules, but they are much faster:
#URLWordPattern		'http://(www[^.]*\.|)([a-z0-9]+)\.[a-z]+/(|(index|default)\.[a-zA-Z0-9]+)'	\2	3
#URLWordPattern		'http://(www[^.]*\.|)([^.]+)\.[a-z]+/(|(index|default)\.[a-zA-Z0-9]+)'		\2	2
#URLWordPattern		'http://(www[^.]*\.|)([^.]+)\.[^/]*/(|(index|default)\.[a-zA-Z0-9]+)'		\2	1
#FileWordPattern	'....://[^?]*/(([^/.?=;]+)|([^/?=;]+)\.[^/.?=;]*)/?$'				\2\3	2
#FileWordPattern	'....://[^?]*/(([^/.?=;]+)|([^/?=;]+)\.[^/.?=;]*)/?\?.*'			\2\3	1

# Allow extraction of words from file names (default=0=no, 1=images only, 2=everywhere)
NameWords		1

# You can always specify the limits for URL, File, and Catalog keywords (-1=unlimited).
# Enter up to 4 numbers for weights 0..3:
#	(0=ignored, 1=normal weight, 2..3=increased weight for URL and File)
# Maximum number of keywords from one URL, the others are cut
URLMaxURLWords		-1	4	4	1
URLMaxFileWords		-1	-1	-1
URLMaxCatalogWords	-1	30
# Maximum number of keywords from one site, the others are cut
SiteMaxURLWords		-1	16	8	2
SiteMaxFileWords	-1	-1	-1
SiteMaxCatalogWords	-1	50
# Maximum total number of keywords, the others are cut
TotalMaxURLWords	-1	32	24	6
TotalMaxFileWords	-1	-1	-1
TotalMaxCatalogWords	-1	60
# When a keyword has frequency bigger than the threshold, its weight is decreased by 1
DecreaseURLWords	-1	200	150	10
DecreaseFileWords	-1	-1	10000
DecreaseCatalogWords	-1	100
# When a keyword has frequency bigger than the threshold, it is removed
RemoveURLWords		-1	200	-1	-1
RemoveFileWords		-1	 -1	20000
RemoveCatalogWords	-1	100

}

######## Backlink generator #####################################################

Backlinker {

# Maximal number of frame backlinking passes (default: unlimited),
# i.e., the maximum number of frame indirections.
MaxFrameDepth		10

# Path to temporary files (relative to Indexer.Directory)
FrameGraph		frame-graph
ImageGraph		image-graph

}

######## Lexicon parameters #####################################################

Lexicon {

# For a description of what constitutes a word, see the Alphabet section in cf/library.

# Words shorter than this limit are ignored
# (all limits are in Unicode characters, not UTF-8)
MinWordLenIgnore	2

# Words shorter than this limit are treated as nonindexable
MinWordLen		2

# Words longer than this limit are treated as nonindexable
MaxWordLen		32

# For words composed entirely of digits, this limit is used instead
MaxNumWordLen		10

# For words composed of both letters and digits, this applies
MaxMixedWordLen		10

# Words longer than this limit and containing only ASCII characters
# are subject to extra checks which try to identify uuencode/base64 etc.
MaxCtrlWordLen		16

# Maximum size of inter-word gap (sequences of >MaxGap non-indexed words are
# squeezed to MaxGap word positions to avoid wasting precious word numbers)
MaxGap			3

#ifdef CONFIG_MAX_CONTEXTS
# How many hash slots we use for contexts. Beware, it must be set to
# at least twice the number of context-dependent words and at most to
# SHERLOCK_MAX_CONTEXTS (see sherlock/default.cfg).
ContextSlots		256
#endif

#ifndef CONFIG_BARE
# Explicitly set categories of some words (and all their lexical variants)
# Overrides MinWordLenIgnore and MinWordLen, but not MaxWordLen.
# To be written in UTF-8 characters.
# List format: Word(Ignored|Normal|Garbage|Context)	{ Word=string }
WordIgnored		the
#WordNormal		
#WordGarbage		
#ifdef CONFIG_MAX_CONTEXTS
WordContext		of and in by is for on it this be with
WordContext		a b c d e f g h i j k l m n o p q r s t u v w x y z
WordContext		0 1 2 3 4 5 6 7 8 9 @ & + §
WordContext		www
#endif
#endif

}

######## The chewer (stage 2 of the indexer) ####################################

Chewer {

# Sizes of pre-sorting buffers for words and strings
WordBufSize		128K
StringBufSize		128K

# Bitmap of string categories we index
# Format: bitmap of string types
StringCats:all

# Maximum number of string entries we index per document
StringMax		4K

# Only first WordLimit words of a document are indexed with full
# positions suitable for phrase searching (at most 255M / 31K depending on
# CONFIG_32BIT_REFERENCES).
# Only first MetaLimit meta's of a document are indexed with full
# positions suitable for phrase searching (at most 0.5M / 2047)
#ifdef CONFIG_32BIT_REFERENCES
WordLimit		1M
MetaLimit		511K
#else
#WordLimit		31K
#MetaLimit		2047
#endif

# Size of document buffer. Any single document must fit here, if
# it doesn't, it gets trimmed.
DocBufSize		64K

# Document texts (used later for showing contexts of matching words) are truncated to the following length
ExcerptMax		256K

# Cards with too many URL's have their URL list trimmed; the same for redirects
MaxURLs			1000
MaxRedirects		100

# If we encounter a giant class (see Merger.GiantDocuments), we ignore these
# meta types (default: none) and decrease the weight of the cards by GiantPenalty.
# Ignoring means that they are not indexed into the fulltext index, however
# they remain present in the cards.
# Format: bitmap of meta types
#GiantBanMeta:all
#GiantBanMeta:remove	title keywd meta file
GiantPenalty		50

#ifndef CONFIG_BARE
# Compute average weight of a character typed in the document.  If it is bigger
# than SwindlerThreshold, the types with nonzero weight are remapped to text.
# Other types (e.g. alt, link) are left untouched.
# Format: section with numeric attributes named after word types
TypeWeights		{ text=20; emph=50; small=10; hdr1=50; hdr2=80; alt=0 }
SwindlerThreshold	40

# Penalize documents with almost no contents.  Compare the number of
# alphanumerical characters with the threshold.
NoContentsPenalty	30
NoContentsThreshold	100

# If the content-type of the document is one of the following ones, check for
# existence of a title and outgoing links.  Penalize documents without them.
HyperTextTypes		text/html	#list of general strings
NoLinksPenalty		10
NoTitlePenalty		10

# If the title of the document is longer than MaxTitleLength characters, its
# weight is decreased
MaxTitleLength		100
#endif

# Cards that cannot be compressed to less than MinCompression% of their
# original size are stored uncompressed
MinCompression		90

}

######## Indexer reporter #######################################################

Reporter {

# If defined, equivalence classes larger than ClassThreshold are logged to ClassLog
ClassLog		large-classes
ClassThreshold		30

#ifdef CONFIG_FILETYPE
# Generate statistics of filetypes
FiletypeStats		1
#endif

#ifdef CONFIG_LANG
# Generate statistics of languages
LanguageStats		1
#endif

# Generate per-domain statistics of the given level (0=off, 1=top-level domains, 2=2nd-level, ...)
#DomainStats		2

}

######## Generation of URL keys #################################################

URLKey {

# Consider http://www.(.*) and http://\1 equivalent. This is a dirty hack which
# should be used if you remove "www." from URL keys in the filters. Will be
# replaced by better handling of duplicates in the gatherer.
WWWHack			1

# Table of equivalent URL prefixes.
PrefixTable		cf/url-equiv

}
