# Configuration of the indexer (included by cf/sherlock)

######## General parameters #####################################################

Indexer {

# Directory we build the index in
Directory		index

# Which databases should we index:	{ String=string }
#
# bucket:<file>		a bucket file as generated by gatherd or gbatch
# text:<file>		textual input, buckets separated by a blank line
# raw:<file>		a sequence of raw buckets, each starting with 32-bit BUCKET_TYPE_xxx
#ifdef CONFIG_SHEPHERD_PROTOCOL
# indexed:<b>:<i>	a bucket file <b> with shepherd's bucket export index <i>
# remote:<host>[:...]	buckets downloaded from a remote shepherd server; options:
#				<port>		Contact Shepherd at the given port
#				W<weight>	Send buckets with this weight or more
#				M<maximum>	Send at most <maximum> buckets
#				B<n>		Send <n> best thick buckets
#				b<n>		Send <n> best thin buckets
#				S<name>		Send the given state
#				and other attributes mentioned in gather/shepherd/protocol.h
# fd:<fd>:<nobj>		shepherd server connected on a given file descriptor (used internally)
#endif
Source			bucket:db/objects

# Verbosity level of general tracing messages (some modules have their own
# tracing switches for specific purposes)
Trace			0

# Filter to be used by the indexer (usually the same as Gatherer.Filter)
Filter			cf/filter

# Default weight of a document before all bonuses and penalties are applied
DefaultWeight		64

# Names of files generated by the indexer, relative to Directory
# (see doc/indexer for who is producing and reading what)
Parameters		parameters
Attributes		attributes
Notes			notes
Labels			labels
Sites			sites
URLList			url-list
URLIndex		url-index
LabelsByID		labels-by-id
Merges			merges
CardInfo		card-info

#ifndef CONFIG_BARE
Fingerprints		fingerprints
FPSplits		fp-splits
Checksums		checksums
Links			links-by-url
RefTexts		ref-texts
ObjectGraph		graph-obj
SkeletonGraph		graph-skel
Signatures		signatures
Matches			matches
NotesSkel		notes-skel
Keywords		keywords
FeedbackGatherer	gatherer-feedback
# the following one is optional, but helpful for debugging
URLSkelList		url-skel-list
#endif

WordIndex		word-index
StringIndex		string-index
Lexicon			lexicon
LexRaw			lexicon-raw
LexOrdered		lexicon-ordered
LexWords		lexicon-words
Stems			stems
StemsOrdered		stems-ordered
References		references
StringMap		string-map
StringHash		string-hash
Cards			cards
CardAttributes		card-attrs

# If you want to use online merging of multiple indices by the search server,
# you need to generate the card-prints file as well.
#CardPrints		card-prints

# If you want to export index to the administration tools,
# this file contains some extra data.
#AdminExport		admin-export

# Indices can be also merged off-line: you can use the black-gen utility on any index
# to generate a blacklist listing cards overridden by cards in other indices, distribute the
# blacklist to search servers and run sherlockd --merge there. This is not handled by the
# default indexer scripts, you have to do it manually.
Blacklist		blacklist

# Minimum size of a document to consider it for detection of duplicates
# (an analogue of Gather.MinSummedSize)
MinSummedSize		64

# Processing of attributes -- here you can define how the indexer processes
# individual object attributes. A complete list of all attributes can be found in doc/objects.
# If you want to include a whole sub-object, use `(s)' syntax. Spaces are ignored.

# These attributes are extracted from the objects and stored into per-URL blocks
# of cards, or into per-redirect sub-blocks if they come via a redirect.
# List format: (Label|OverrideLabel|OverrideBody|Card|Link)Attrs	{ Attr=letter }
LabelAttrs		D E J L T V c k s

# In some cases, object attributes can be overridden by the scanner and analysers
# called from there. Such attributes have to be declared here to propagate them
# correctly to either the per-URL block or to the object body.
OverrideBodyAttrs	H
#OverrideLabelAttrs	8 9
#OverrideBodyAttrs	8 9

# Attributes copied to the index cards (from object body; those in per-URL blocks
# are included automatically and the "X" attribute likewise).
CardAttrs		G H K M N W b e i t q u y z . (f)

# URL attributes used to generate edges of the link graph
LinkAttrs		A F I R Y a d

# Link attributes we extract reference texts for (default: none=reftexts disabled)
# List format: RefLinkTypes	{ Attr=letter }
RefLinkTypes		I R

# Maximum length of a reference text
RefMaxLength		80

# Minimum number of alphanumeric characters in a reference text
RefMinLength		3

# Maximum number of reference texts for a single card (at most 127)
RefMaxCount		10

# The lexfreq utility can be used to dump lexicon sorted by word frequencies to:
LexByFreq		lexicon-by-freq

# Enable debugging dump of lexicon equivalence classes to the following file:
LexClasses		lexicon-classes

# Average bucket size for the string hash (in entries)
StringAvgBucket		120

# Seekable file access used by the indexer (see FBParam section for details)
FileAccess		std 64K

# Non-seekable file access used by the indexer
StreamFileAccess	std 64K

# Buffer size for threaded file access used by the indexer
FileBufSize		64K

# Progress indicator: update it every Progress documents processed (0=off)
# and show on screen or in the process status line. See also bin/indexer -v.
Progress		100
ProgressScreen		0
ProgressStatusLine	1

# Sorters should delete their source files as soon as possible to conserve space
SortDeleteSrc		0

# Maximum number of objects we're willing to process
# (useful for testing indexer on a subset of documents; default: unlimited)
#MaxObjects		10000

# Enable conversion of frameset documents to redirects to one of the
# frames instead of indexing the noframe version. (Useful when your documents
# contain bogus noframe texts like "the stone age is already over" :) )
FramesetToRedir		1

# MAGIC: Make the second stage of the indexer (mklex, chewer, ...) assume that its input
# consists of already preprocessed cards. You probably don't want to use this.
RawStage2Input		0

# The index can be split to several subindices based on filetype mask (4-bit filetype)
# and id mask (bottom 4 bits of URL hash or of site hash if site compression is enabled).
# List format: SubIndex	{ Name=string; TypeMask=int; IDMask=int }
#			name	types	ids
#SubIndex		main1	0x00ff	0x5555
#SubIndex		main2	0x00ff	0xaaaa
#SubIndex		img	0xff00	0xffff

# The reference chains in each subindex can be further split to slices,
# which share a single file, but which can be processed in parallel if the
# search server is configured to use multiple threads. (default: 1; at most HARD_MAX_SLICES)
Slices			1

# Some parts of the indexer are multi-threaded. Here you can set the number of threads
# (which should probably be equal to the number of CPUs your machine has) and also
# the thread stack size (defaults: 1 thread, Threads.DefaultStackSize).
Threads			1
ThreadStackSize		1M

# Reject empty objects (default=0)
RejectEmpty		1

}

#ifdef CONFIG_SHEPHERD_PROTOCOL
######## Connecting to a remote Shepherd server #################################

IConnect {

# Timeout during connection setup and initial handshake [sec]
ConnectTimeout		300

# Timeout when waiting for a reply
ReplyTimeout		300

# Number of connect retries and delay between them
RetryCount		96
RetryDelay		900

# Send gatherer feedback back to Shepherd
SendFeedback		1

}
#endif

######## The scanner (the beginning of stage 1 of the indexer) ##################

Scanner {

# How many documents are processed in a single batch (small values increase overhead
# of threading and syscalls, large values consume memory [tens of bytes per unit]).
BatchSize		10000

# Turn on if you need to run extra filter for each link
# (for example if the filter uses url_xform).
FilterLinks		0

}

######## Approximative matcher of similar documents #############################

Matcher {

# The number of signatures computed for a document.  The larger the number,
# the more memory is used and the more time is spent.  However, if the number
# is too small, the approximation algorithm is not accurate.  Setting it to
# zero disables the algorithm.
Signatures		50

# The signatures are computed from a cyclic hash of Context consecutive
# words.  Context must be neither too big (otherwise a small change in a
# word influences broad environs), nor too small (otherwise the cyclic hash
# does not involve the word order very well).
Context			4

# Only documents longer than MinWords words are merged by the approximative
# matcher.  This is convenient because a small change in a short text (about 20
# words) can be significant, though the signatures are almost equal.  The
# default value is 0.
MinWords		20

# Two documents are considered similar if at least Threshold hashes of
# their signatures are equal.  The number #EqualHashes/Signatures is a
# good approximation of the fraction of the documents' content that is identical.
Threshold		45

# The signatures are sorted Passes times on a hash chosen at random.  Each
# time, the index is read linearly and for each value of the key hash, similar
# documents are detected.  The more passes, the slower and more precise.
Passes			5

# Block size (in documents) of the matching pass.  Sorted signatures are
# read consecutively in blocks of at most Block documents and processed
# _quadratically_ in memory.
Block			64

}

######## Resolving of fingerprints ##############################################

Resolve {

# Trace per-block statistics
Trace			0

# Number of threads (0=disable threading)
Threads			0

# Number of batches to prefetch (must not be less than the number of threads).
# With disabled threads, the value is always 1.
Prefetch		16

# Per-batch buffer size in bytes. Higher values should decrease sync/syscall overhead.
BatchSize		4K

# Maximum allowed density of hash tables (with linear probing). Limited to [0.1, 0.9].
MaxHashDensity		0.45

}

######## Merging of identical / similar documents ###############################

Merger {

# If defined, classes larger than GiantDocuments (counting duplicates)
# or than GiantRedirects (counting also redirects) are marked giant
GiantDocuments		100
GiantRedirects		3000

}

######## {URL,File} keywords processor ##########################################

Keywords {

# Level of debugging outputs: 1 explains changes, 2 is for programmers
Trace			0

# Extraction of words from URL's (a list of regex substitution rules + weight [0 to 3])
# (lexmapper is run on the result)
# List format: (URL|File)WordPattern	{ Pattern=string; Replace=string; Weight=[0123]; Hardcoded=[01] }
URLWordPattern		{ Hardcoded 1 }
FileWordPattern		{ Hardcoded 1 }
#The hardcoded rules are equivalent to the following rules, but they are much faster:
#URLWordPattern		'http://(www[^.]*\.|)([a-z0-9]+)\.[a-z]+/(|(index|default)\.[a-zA-Z0-9]+)'	\2	3
#URLWordPattern		'http://(www[^.]*\.|)([^.]+)\.[a-z]+/(|(index|default)\.[a-zA-Z0-9]+)'		\2	2
#URLWordPattern		'http://(www[^.]*\.|)([^.]+)\.[^/]*/(|(index|default)\.[a-zA-Z0-9]+)'		\2	1
#FileWordPattern	'....://[^?]*/(([^/.?=;]+)|([^/?=;]+)\.[^/.?=;]*)/?$'				\2\3	2
#FileWordPattern	'....://[^?]*/(([^/.?=;]+)|([^/?=;]+)\.[^/.?=;]*)/?\?.*'			\2\3	1

# Allow extraction of words from file names (default=0=no, 1=images only, 2=everywhere)
NameWords		1

# You can always specify the limits for URL, File, and Catalog keywords (-1=unlimited).
# Enter up to 4 numbers for weights 0..3:
#	(0=ignored, 1=normal weight, 2..3=increased weight for URL and File)
# Maximum number of keywords from one URL, the others are cut
URLMaxURLWords		-1	4	4	1
URLMaxFileWords		-1	-1	-1
URLMaxCatalogWords	-1	30
# Maximum number of keywords from one site, the others are cut
SiteMaxURLWords		-1	16	8	2
SiteMaxFileWords	-1	-1	-1
SiteMaxCatalogWords	-1	50
# Maximum total number of keywords, the others are cut
TotalMaxURLWords	-1	32	24	6
TotalMaxFileWords	-1	-1	-1
TotalMaxCatalogWords	-1	60
# When a keyword has frequency bigger than the threshold, its weight is decreased by 1
DecreaseURLWords	-1	200	150	10
DecreaseFileWords	-1	-1	10000
DecreaseCatalogWords	-1	100
# When a keyword has frequency bigger than the threshold, it is removed
RemoveURLWords		-1	200	-1	-1
RemoveFileWords		-1	 -1	20000
RemoveCatalogWords	-1	100

}

######## Backlink generator #####################################################

Backlinker {

# Maximal number of frame backlinking passes (default: unlimited),
# i.e., the maximum number of frame indirections.
MaxFrameDepth		10

# Path to temporary files (relative to Indexer.Directory)
FrameGraph		frame-graph
ImageGraph		image-graph

}

######## Lexicon parameters #####################################################

Lexicon {

# For a description of what constitutes a word, see the Alphabet section in cf/libucw.

# Words shorter than this limit are ignored
# (all limits are in Unicode characters, not UTF-8)
MinWordLenIgnore	2

# Words shorter than this limit are treated as nonindexable
MinWordLen		2

# Words longer than this limit are treated as nonindexable
MaxWordLen		32

# For words composed entirely of digits, this limit is used instead
MaxNumWordLen		10

# For words composed of both letters and digits, this applies
MaxMixedWordLen		10

# Words longer than this limit and containing only ASCII characters
# are subject to extra checks which try to identify uuencode/base64 etc.
MaxCtrlWordLen		16

# Maximum size of inter-word gap (sequences of >MaxGap non-indexed words are
# squeezed to MaxGap word positions to avoid wasting precious word numbers)
MaxGap			3

#ifdef CONFIG_MAX_CONTEXTS
# How many hash slots we use for contexts. Beware, it must be set to
# at least twice the number of context-dependent words and at most to
# SHERLOCK_MAX_CONTEXTS (see sherlock/default.cfg).
ContextSlots		256
#endif

#ifndef CONFIG_BARE
# Explicitly set categories of some words (and all their lexical variants)
# Overrides MinWordLenIgnore and MinWordLen, but not MaxWordLen.
# To be written in UTF-8 characters.
# List format: Word(Ignored|Normal|Garbage|Context)	{ Word=string }
WordIgnored		the
#WordNormal		
#WordGarbage		
#ifdef CONFIG_MAX_CONTEXTS
WordContext		of and in by is for on it this be with
WordContext		a b c d e f g h i j k l m n o p q r s t u v w x y z
WordContext		0 1 2 3 4 5 6 7 8 9 @ & + §
WordContext		www
#endif
#endif

}

######## The chewer (stage 2 of the indexer) ####################################

Chewer {

# Sizes of pre-sorting buffers for words and strings
WordBufSize		128K
StringBufSize		128K

# Bitmap of string categories we index
# Format: bitmap of string types
StringCats:all

# Maximum number of string entries we index per document
StringMax		4K

# Only first WordLimit words of a document are indexed with full
# positions suitable for phrase searching (at most 255M / 31K depending on
# CONFIG_32BIT_REFERENCES).
# Only first MetaLimit meta's of a document are indexed with full
# positions suitable for phrase searching (at most 0.5M / 2047)
#ifdef CONFIG_32BIT_REFERENCES
WordLimit		1M
MetaLimit		511K
#else
#WordLimit		31K
#MetaLimit		2047
#endif

# Size of document buffer. Any single document must fit here, if
# it doesn't, it gets trimmed.
DocBufSize		64K

# Document texts (used later for showing contexts of matching words) are truncated to the following length
ExcerptMax		256K

# Cards with too many URL's have their URL list trimmed; the same for redirects
MaxURLs			1000
MaxRedirects		100

# If we encounter a giant class (see Merger.GiantDocuments), we ignore these
# meta types (default: none) and decrease the weight of the cards by GiantPenalty.
# Ignoring means that they are not indexed into the fulltext index, however
# they remain present in the cards.
# Format: bitmap of meta types
#GiantBanMeta:all
#GiantBanMeta:remove	title keywd meta file
GiantPenalty		50

#ifndef CONFIG_BARE
# Compute average weight of a character typed in the document.  If it is bigger
# than SwindlerThreshold, the types with nonzero weight are remapped to text.
# Other types (e.g. alt, link) are left untouched.
# Format: section with numeric attributes named after word types
TypeWeights		{ text=20; emph=50; small=10; hdr1=50; hdr2=80; alt=0 }
SwindlerThreshold	40

# Penalize documents with almost no contents.  Compare the number of
# alphanumerical characters with the threshold.
NoContentsPenalty	30
NoContentsThreshold	100

# If the content-type of the document is one of the following, check for
# existence of a title and outgoing links.  Penalize documents without them.
HyperTextTypes		text/html	#list of general strings
NoLinksPenalty		10
NoTitlePenalty		10

# If the title of the document is longer than MaxTitleLength characters, its
# weight is decreased
MaxTitleLength		100
#endif

# Cards that cannot be compressed to less than MinCompression% of their
# original size are stored uncompressed
MinCompression		90

}

######## Indexer reporter #######################################################

Reporter {

# If defined, equivalence classes larger than ClassThreshold are logged to ClassLog
ClassLog		large-classes
ClassThreshold		30

#ifdef CONFIG_FILETYPE
# Generate statistics of filetypes
FiletypeStats		1
#endif

#ifdef CONFIG_LANG
# Generate statistics of languages
LanguageStats		1
#endif

# Generate per-domain statistics of the given level (0=off, 1=top-level domains, 2=2nd-level, ...)
#DomainStats		2

# Show statistics about at most DomainStatsLimit domains (default=infinity)
#DomainStatsLimit	10

}

#ifdef CONFIG_WEIGHTS

######## Dynamic weights ########################################################

Weights {

# Trace calculation of weights
Trace			0

# The number of parallel threads of computation
Threads			1

# Maximum number of passes allowed
MaxEigenPasses		200
# If delta is not improved at least CheckThreshold/1000 times in CheckPasses
# passes, stop
CheckPasses		10
CheckThreshold		1100

# Stop when differences between two passes are less than MinChange
MinChange		3e-7

# The difference between weights of the worst and the best page
MaxWeight		255

# Characteristics of a random browser: probability that he jumps to a totally
# random page, that he jumps to a random page according to its (filter) weight,
# and that he follows a random link from the current page.  All probabilities
# are relative to each other.
ProbRandom		0
ProbWeight		10
ProbFollow		85

# Relative weight of links going within a site and between different sites.
# The higher the weight, the more pagerank goes into that link.
LinkWeight		1 2

# Coefficient of successive overrelaxation (SOR).  The default 1.0 means no
# overrelaxation, and the recommended setting is 1+epsilon depending on your data.
Overrelax		1.05

# Where to put the vector of rank sources and temporary vector of ranks,
# relative to Indexer.Directory
IntraGraph		graph-intra
LeafGraph		graph-leaf
LeafSourceRank		rank-leaf-source
ObjRank			rank-obj
SkelRank		rank-skel

# Optional file to log calculated weights to, relative to Indexer.Directory
#WeightLog		weights

}
#endif

#ifdef CONFIG_IMAGES
######## Indexing of images ################################################

ImageIndexer {

#ifdef CONFIG_IMAGES_DUP
# Temporary indexer file
ImageThumbnails		image-thumbnails
#endif

#ifdef CONFIG_IMAGES_SIM
# Similar images

# Index files. ImageSignatures and ImageClusters belong to final index. 
ImageSignaturesUnsorted	image-signatures-unsorted
ImageSignatures		image-signatures
ImageClusters		image-clusters

# To allow effective evaluating of queries with similar images,
# we build a hierarchical structure in the space of average image features.
# The search server loads only a single leaf per query.
# Number of images in a single leaf is limited by SigMaxClusterCount. 
SigMaxClusterCount	100000
#endif

}

#ifdef CONFIG_IMAGES_DUP
MergeImages {

# Tracing level (0=silent, 1=default, 2=verbose, 3=more debug messages)
Trace				1

# Tracing level for sorter (default Sorter.Trace)
TraceSorter			0

# Number of threads (0=disabled threading)
Threads				0

# Size of per-thread stack (default=Threads.DefaultStackSize)
#ThreadStackSize		64K

# Buffer used mainly for preloading of image clusters to memory,
# before they are searched for duplicates. Too small value
# may lead to many missed duplicates. Note, that the buffer
# is equally split among all threads, so a higher number of threads
# would need a larger value.
BufSize				256M

# Additional size limits for image clusters, by default both set to infinity.
#MaxClusterSize			256M
#MaxClusterCount		10000

# Large clusters which do not fit in memory are processed in multiple passes. In each pass
# the merger splits them recursively by random hyperplanes and examines resulting
# parts separately.
MaxPasses			2

# Enable support for scaled duplicates.
SupportResize			1

# Minimum size of image and its thumbnail to allow detecting of scaled duplicates.
ResizeableMinWidth		128
ResizeableMinHeight		128
ResizeableMinThumbWidth		32
ResizeableMinThumbHeight	32

# Decide whether to compare pairs of images of the same size pixel by pixel
# or just rely on the Quad-Tree. The default value of 2 enables
# the comparison for all images, 1 only for the non-resizeable images
# and 0 completely disables the comparison.
SameSizeCompare			2

# Enables merging of rotated (1) or even flipped duplicates (2).
Transformations			0

# Square root of the maximum tolerated average quadratic error
# in RGB values, when comparing two images. The range is 0..255.
PixelThreshold			8

# The maximum tolerated distance between two duplicate feature vectors.
# Too high value will lead to a very poor performance, too low value
# to many missed duplicates.
VectorThreshold			5

# Maximum relative difference between duplicate aspect ratios.
# Vector distance for a pair of such duplicate images
# is 32 * log_2(1 + AspectRatioThreshold) * VectorWeights[0].
AspectRatioThreshold		5%

# Weight multipliers for all compared features. A lower value
# means a higher tolerance to differences.
# The features are: aspect ratio, L, u, v, LH, HL, HH.
VectorWeights			100% 100% 100% 100% 20% 20% 20%

# Clusters with at least KdTreeMinCount vectors are optimized with a hierarchic
# Kd-Tree data structure. Otherwise all pairs of vectors would be compared.
KdTreeMinCount			100

# Maximum Kd-Tree depth.
KdTreeMaxDepth			20

# Completely ignore images which would need more than MaxImageSize bytes of memory.
MaxImageSize			4M

# Maximum Quad-Tree depth.
QuadTreeLimit			8

}
#endif
#endif

