# This file contains configurable parameters of the Sherlock system.
#
# The configuration consists of several sections corresponding to Sherlock
# modules. Each section is introduced by
#	[SectionName]
# and contains items of type
#	Name	Value
# Each module recognizes its own subset of sections and ignores the rest.
# However, unknown items in recognized sections are reported as errors.
#
# Lines starting with "#" are treated as comments.
#
# Numeric values can be optionally suffixed by units:
#	k	1000		K	1024		%	0.01
#	m	1000000		M	1048576		d	86400 (sec/day)
#	g	1000000000	G	1073741824	h	3600 (sec/hour)
#
#ifdef NOTDEF
#
# This is a source form of the configuration which is run through a preprocessor
# which understands the usual '#ifdef', '#ifndef', '#else' and '#endif' directives,
# like a C preprocessor, which test compile-time settings defined in config.mk.
#
#endif

### Filter Engine
[Filter]

# Trace compiling, optimizing, and interpreting filters
Trace		0

# SWITCH commands with more than HASHLIMIT equality tests of strings (operators
# == and ===) are optimized using hash-tables.  TrieLimit is for building tries
# for operators =* and =** and TreeLimit is for building binary search trees for
# operators =# and =##.
HashLimit	4
TrieLimit	4
TreeLimit	4

# Switch controlling optimizing parsed filters
Optimize	1

# If set, the optimized filter is dumped into the given text file
#DumpFilterTo	tmp/optimized-filter

### Gatherer
[Gather]

# Maximum size of a single object (default: unlimited) (larger object => see below)
MaxObjSize		200000

# Allow truncation of objects (0=report error, 1=default=truncate them)
AllowTruncate		1

# Maximum size after decompression or other processing (if exceeded, trimmed)
MaxDecodeSize		1M

# Smaller documents are not checksummed for detection of duplicates during gathering
# (0=checksumming turned off)
MinSummedSize		256

# Trace various decoders
TraceDecode		0

# Trace resolving of IP addresses and queue keys
TraceResolve		0

# Filter to be used by the gatherer
Filter			cf/filter

# Log URL errors in references
LogRefErrors		1

# Log relative base URL's where it's forbidden
LogBaseErrors		1

# Access list defining which IP addresses we are allowed to connect to.
# The rules are traversed sequentially, the first match wins.
# Each rule specifies a host or network address and possibly also a netmask
# or prefix length in bits. If no rules match, the default is Deny.
DenyIP			0.0.0.0/8
AllowIP			127.0.0.0/8
DenyIP			224.0.0.0/224.0.0.0
AllowIP			0.0.0.0/0

# If-Modified-Since downloads: request that the new document be at least this
# number of seconds newer than the old one (a guard against improper IMS
# implementations; -1=don't use IMS).
#MinIMSDelay		0

# If more than MaxRefreshAge seconds have elapsed since the last full
# download, disable conditional downloads and other tricks and perform
# full download. (default: unlimited, 0=always reload)
#MaxRefreshAge		 86400

# Objects that cannot be compressed to less than MinCompression% of their
# original size are stored uncompressed
MinCompression		90

### URL processing
[URL]

# Ignore spaces at start/end of URL
IgnoreSpaces		1

# Ignore underflows in relative paths (/../ from root)
IgnoreUnderflow		1

# Some URL's with many repeated components are filtered out to avoid infinite
# URL's (e.g. http://czech.recoder.cz/win/iso/win/iso/file.html, or
# http://a.com/?a=b&a=b&a=b, ...).

# The URL is split to components divided by any of the specified separators.
# Then the separators are forgotten and the components between them are
# examined.
ComponentSeparators	/&?

# URL is filtered out if there's a sequence of components in a row with at most
# MaxRepeatLength components and the sequence is repeated more than MinRepeatCount
# times.  Default values are high MinRepeatCount and low MaxRepeatLength, so the
# mechanism is disabled.
MinRepeatCount		4
MaxRepeatLength		4

### HTTP downloader
[HTTP]

# Maximal number of header lines (larger => reject)
MaxHeaderLines		1000

# HTTP header fields (see RFC 2068 for an explanation)
#From			set-your-mail-address-here
#ifdef CONFIG_PDF
AcceptTypes		text/html, text/plain;q=0.3, application/pdf
#else
AcceptTypes		text/html, text/plain;q=0.3
#endif
#Referer		http://some.referring.page/you/wish/to/send
#AcceptCharset		ISO-8859-2, ISO-8859-1;q=0.5, *;q=0.2
AcceptEncoding		deflate, gzip, compress
#AcceptLanguage		cs, en;q=0.5, *;q=0.2

# User-defined header fields
#Header			X-Magic: Poof!

# Timeout in seconds for connect, header complete and body complete
ConnectTimeout		60
HeaderTimeout		60
BodyTimeout		300

# Definition of proxy (caching might be useful when testing). Please keep in mind
# that the gatherer still needs to be able to resolve host names directly, because
# it constructs QKeys from IP addresses.
# UseProxy		cache.cuni.cz
# ProxyPort		3128

# Trace HTTP dialogs
Trace			0

# Check if length of downloaded data stream matches the one given in header
# Must be turned off if you want to support SaCzech :-(
LengthChecks		0

### File downloader
[File]

# Allow scanning of directories
AllowDirs		1

### Character set detector
[Charset]

# Trace all charset operations, 2 for higher verbosity level
Trace			0

# Log server-side errors (unrecognized and mismatched charsets)
LogErrors		1

# Mapping hostname -> language list if no languages are specified
DefLang			localhost	"cs en"
DefLang			*.cz	"cs en"
DefLang			*.sk	"sk en"
DefLang			*.hu	"hu en"
DefLang			*.pl	"pl en"
DefLang			*	"cs sk en"

# List of internal language names together with corresponding official
# language codes.
Language		cs	cs sk
Language		pl	pl
Language		hu	hu

# Canonical names of all known character sets
#Charset			IBM
Charset			ISO-8859-1
Charset			ISO-8859-2
Charset			ISO-8859-3
Charset			ISO-8859-4
Charset			ISO-8859-5
Charset			ISO-8859-6
Charset			ISO-8859-7
Charset			ISO-8859-8
Charset			ISO-8859-9
Charset			ISO-8859-10
Charset			ISO-8859-11
Charset			ISO-8859-13
Charset			ISO-8859-14
Charset			ISO-8859-15
Charset			ISO-8859-16
Charset			cp852
Charset			Windows-1250
Charset			Windows-1252
Charset			CSN_369103
Charset			x-kam-cs
#Charset		x-cork
Charset			x-mac-ce
Charset			US-ASCII

# Internal hook for UTF-8 (accepted automatically for all languages if defined)
Charset			UTF-8

# Fallback charset if we were reported an unknown charset
FallbackCharset		US-ASCII

# Believe in charset reported by the server (1=always, -1=never, 0=validate it)
BelieveServer		0

# Believe in charset reported in META tags (1=always, -1=never, 0=validate it)
# (server-reported charset has always precedence unless it's known to be invalid)
BelieveMETA		0

# Usual character sets for various languages (best-first order, the first one is fallback)
AutoSets		cs	ISO-8859-2 Windows-1250 UTF-8 cp852 x-kam-cs x-mac-ce CSN_369103
AutoSets		hu	ISO-8859-2 Windows-1250 UTF-8 x-mac-ce
AutoSets		pl	ISO-8859-2 Windows-1250 UTF-8
AutoSets		*	ISO-8859-1 UTF-8

# Lists of usual and forbidden UCS-2 codes on per-language basis

CForbid			*	0000-0008 000B 000E-001F 007F-009F

CTypical		cs	00E1 010D 010F 00E9 011B 00ED 013A 013E 0148 00F3 00F4 0159
CTypical		cs	0161 0165 00FA 016F 00FD 017E 00C1 010C 010E 00C9 011A 00CD
CTypical		cs	0139 013D 0147 00D3 00D4 0158 0160 0164 00DA 016E 00DD 017D

CTypical		pl	0142 017C 015B 0119 0105 00F3 0107 0144 0141 015A 017A 017B
CTypical		pl	00D3 0106 0118 0104 0179 0143

CTypical		hu	00E9 00E1 0171 0151 00FA 00F6 00FC 00F3 00ED 00C9 00C1 0170
CTypical		hu	0150 00DA 00D6 00DC 00D3 00CD

# List of improbable characters

# Czech characters written in cp-1250 pasted into iso-8859-2 text
CImprobable		cs	008A 008D 008E 009A 009D 009E
# Czech and Slovak characters written in iso-8859-2 into cp-1250 text
CImprobable		*	00B5 00BB 00A9 00AB 
# Improbable in Czech, but common in Polish
CImprobable		cs	0105 0104
# Parts of unicode sequences of Czech characters misinterpreted as iso-8859-2 or cp-1250
CImprobable		*	0102
# Polish characters different in iso-8859-2 and cp-1250
CImprobable		pl	0161 013D 02C7

# One improbable/forbidden character is penalized like the following number of typical characters
ImprobablePenalty	5
ForbiddenPenalty	50
# One incorrect UTF-8 sequence is penalized like the following number of correct UTF-8 sequences
UTF8Penalty		20

# We believe the charset reported by the server/META-charset only if the
# average grade of non-ASCII characters (+1 for typical, -ImprobablePenalty for
# improbable, 0 for the others) is at least BelieveMinGrade/1000.  Otherwise we
# run an auto-detection.  If it fails too, then we try the original charsets
# with lower threshold BelieveMinGrade2/1000 and then, finally, use the
# fall-back charset.
BelieveMinGrade		700
BelieveMinGrade2	0

### Content-type and content-encoding detector
[Content]

# Trace all data type operations, 2 for higher verbosity level
Trace			0

# Input translation of content encodings

InEnc	x-gzip			gzip
InEnc	x-zlib			deflate
InEnc	zlib			deflate
InEnc	x-deflate		deflate
InEnc	x-compress		compress

# Enabled content encodings (don't disable any of them unless you also remove
# it from the list of accepted encodings in HTTP section). The second
# column can specify a filename suffix to be stripped when decoding.

Encoding	gzip		.gz
Encoding	deflate
Encoding	compress	.Z
Encoding	x-bzip2		.bz2

# Content types which are actually content encodings

TypeEnc	application/gzip	gzip
TypeEnc	application/compress	compress
TypeEnc	application/bzip2	x-bzip2

# Input data type translator. Several names are known as unreliable, several
# others are mere aliases... "UNKNOWN" is a stub type for type rejection,
# "ERROR" means an error message should be logged.

InType	text/x-server-parsed-html	text/html
InType	application/x-gzip		application/gzip
InType	application/x-compress		application/compress
InType	application/x-zlib		application/zlib
InType	application/octet-stream	UNKNOWN
InType	message/*			ERROR
InType	x-sherlock/*			ERROR

# Data type guessing rules (applied sequentially, must resolve to correct
# data type or to "UNKNOWN").

Bytes	0:"GIF87a"	image/gif
Bytes	0:"GIF89a"	image/gif
Bytes	0:"FORM",8:"ILBM"	image/ilbm
Bytes	0:FFD8,6:"JFIF"	image/jpeg
Bytes	0:FFD8,6:"Exif"	image/jpeg
Bytes	0:FFD8,6:"Adobe"	image/jpeg
Bytes	0:89,1:"PNG"	image/png
Bytes	0:1F9D		application/compress
Bytes	0:1F8B		application/gzip
Bytes	0:"BZh"		application/bzip2
Bytes	0:"%!PS-Adobe"	application/postscript
Bytes	0:"{\rtf1"	text/rtf
# MS Word: for DOS, with OLE, 1.x, 2.0, MacWord 4, and MacWord 5 (taken from antiword)
Bytes	0:31BE000000AB		application/msword
Bytes	0:D0CF11E0A1B11AE1	application/msword
Bytes	0:9BA52100		application/msword
Bytes	0:DBA52D00		application/msword
Bytes	0:FE37001C0000		application/msword
Bytes	0:FE3700230000		application/msword
Extension	html		text/html
Extension	htm		text/html
Extension	tex		text/tex
Extension	txt		text/plain
Extension	wml		text/vnd.wap.wml
Extension	rtf		text/rtf
Extension	xml		text/plain
Extension	gz		application/gzip
Extension	Z		application/compress
Extension	bz2		application/bzip2
Extension	tar		application/tar
Extension	tgz		application/gzip
Extension	gif		image/gif
Extension	jpeg		image/jpeg
Extension	jpg		image/jpeg
Extension	png		image/png
Extension	bmp		image/x-bmp
Extension	pcx		image/x-pcx
Extension	ico		image/x-ico
Extension	tif		image/tiff
Extension	tiff		image/tiff
Extension	ppm		image/x-portable-pixmap
Extension	wrl		model/vrml
Extension	css		text/css
Extension	ps		application/postscript
Extension	eps		application/postscript
Extension	ai		application/postscript
Extension	pdf		application/pdf
Extension	mid		application/x-midi
Extension	avi		application/x-avi
Extension	mov		application/x-mov
Extension	zip		application/x-zip
Extension	arj		application/x-arj
Extension	jar		application/x-jar
Extension	rar		application/x-rar
Extension	lzh		application/x-lharc
Extension	exe		application/x-exe
Extension	hqx		application/x-hqx
#Extension	doc		application/msword
Extension	xls		application/excel
Extension	ppt		application/powerpoint
Extension	602		application/x-602
Extension	dvi		application/x-dvi
Extension	wmf		application/x-msmetafile
Extension	swf		application/x-shockwave-flash
Extension	rpm		application/x-rpm
Extension	deb		application/x-deb
Extension	pgp		application/x-pgp
Extension	dxf		application/x-dxf
Extension	au		audio/basic
Extension	mp3		audio/mpeg
Extension	mp2		audio/mpeg
Extension	m3u		audio/x-mpegurl
Extension	aiff		audio/x-aiff
Extension	wav		audio/x-wav
Extension	ra		audio/x-realaudio
Extension	mpe		video/mpeg
Extension	mpeg		video/mpeg
Extension	mpg		video/mpeg
Extension	asf		video/x-ms-asf
Extension	asx		video/x-ms-asx
Extension	dll		application/octet-stream
Extension	nlm		application/octet-stream
Extension	wma		application/octet-stream
IsAscii			text/plain
IsAscii			text/html
IsAscii			text/vnd.wap.wml
IsAscii			x-sherlock/robots

# A file is considered to be an ASCII-file if the fraction of ASCII-characters
# is at least AsciiFraction/1000
AsciiFraction		990

### Document parser
[Parse]

# Trace document parsing
Trace			0

# Maximum number of document conversions (default: 0=unlimited)
MaxConversions		10

# A list of content-type parsing rules
# When adding new types, remember to update HTTP.AcceptTypes as well
Type			text/plain		text
Type			text/html		html
Type			x-sherlock/robots	robots
#ifdef CONFIG_PDF
Type			application/pdf		pdf
#endif

# A list of content-encoding conversion rules
Encoding		gzip		gzip
Encoding		deflate		deflate
Encoding		compress	compress
#Encoding		bzip2		bzip2

### Document validator (needs to be enabled in filters)
[Validate]

# Do validate documents during parsing
Validate		0

# Rules mapping content types to validation commands
# (they get document contents on their stdin and should spit error messages
# to either stdout or stderr)
Validator		text/html	/opt/wdg-validator/bin/validate

### HTML parser
[HTML]

# Trace execution
Trace			0

# Comment parsing mode:	0=standard
#			1=historical (any ">" terminates)
#			2=netscape (any "-->" terminates)
CommentMode		2

# Hack to minimize confusion caused by missing end quotes.
# 0	Do nothing
# 1	EOL terminates quoted string
# 2	">" after first EOL terminates quoted string
QuoteHack		2

# Hack to work around broken script ends
# 0	Scripts end with </[a-zA-Z] (as per HTML 4.01 specs)
# 1	Scripts end with </script>
ScriptHack		1

# Hack to work around incorrectly terminated character references
CharRefHack		1

# Ignore unknown character references instead of leaving them in text
IgnoreUnknownCRs	0

# Log META tags (0=off, 1=unknown ones, 2=all)
LogMetas		0

# Search for charset in META tags
MetaCharset		1

# Respect comments of type <!-- robots:noindex --> and <!-- /robots:noindex -->
# as a local alternative to the META robot control elements.
RobotComments		1

# Selected META tags can be recognized and stored as object attributes.
# This overrides their default meaning for the HTML parser.
#MetaAttr		1 keywords

### Robot file parser
[Robots]

# Tracing
Trace			0

# Work around common errors
WorkArounds		1

# Name of our robot we search for
RobotName		holmes

### Interface to external parsers
[ExternalParser]

# Parameters: source content-type, destination content-type, and command with
# its parameters.  These parsers need not be inserted again using Parse.Type.
#AddParser	application/postscript	text/plain	bin/ps-parse
#AddParser	application/msword	text/plain	antiword -t -

#ifdef CONFIG_PDF
[PDF]

# Tracing
Trace			0

# Max. count of indirect objects in PDF file = XrefTabSize^2
XrefTabSize		1000

# Respect if the permissions of the documents forbid extraction of text
RespectUserRights	1

# Log warnings about unsupported features
Warnings		1
#endif

### Gather Daemon
[GatherD]

# Where to send the log (default: stderr)
LogFile			log/gatherd-%Y%m%d

# Maximum size of bucket file in kilobytes (default: 0=unlimited)
MaxBucketFileSize	0

# If the number of known hosts exceeds this number, the gatherer
# shuts down automatically (default: 0=unlimited)
MaxHostCount		0

# Maximal number of running gatherer threads
MaxThreads		8

# Trace thread execution
TraceThreads		0

# Minimal delay in seconds between two accesses to the same server
MinServerDelay		30

# Retry timers for temporary errors (timeouts, DNS failures etc.):
# <= RecErrLimit errors: retry after RecErrDelay1 seconds
# <= MaxRecErr errors: retry after RecErrDelay2 seconds
# > MaxRecErr errors: consider the error permanent
RecErrLimit		5
MaxRecErr		20
RecErrDelay1		300
RecErrDelay2		4h

# Trace all references (0=off, 1=only forbidden ones, 2=all)
TraceRefs		0

# Add reference to server root automatically?
AutoGoRoot		1

# Ignore all references and gather only directly specified files (default=0)
#IgnoreRefs		1

# Maximum number of URL's per server section (default: unlimited)
# The soft limit bounds the maximum number of gathered items,
# the hard one bounds number of URL's in the URLdb.
#SoftMaxObj		1000
#HardMaxObj		1000

# Number of pages of queue cache
QueueCacheSize		64

# Cache sizes for URL and MD5 databases (in pages)
#URLDbCacheSize		4096
#MD5DbCacheSize		256

# Maximum time in seconds a subprocess is allowed to run (must be <=1000000)
MaxRunTime		600

# Dump full document contents to objects (for debugging)
DumpFullObjs		0

# Sync all databases automatically after this number of objects processed
# (default: 0=don't)
AutoSync		1

# Maximum number of threads resolving IP addresses simultaneously
MaxResolvers		4

# Size of hash table for host names and for queue keys
HostHashSize		16384
KeyHashSize		16384

# The gatherer lock file
LockFile		db/GATHERLOCK

# Names of host and queue files
HostFile		db/hosts
HostFileBak		db/hosts.bak
QueueFile		db/queue

# Name of URL database file
URLDBFile		db/URL.db

# Name of MD5 hash database file (used for early detection of duplicates,
# undefine if you want to disable the detector; you don't need that unless
# the pages you scan contain a lot of identical subtrees as the duplicates
# are handled by the indexer anyway and this only speeds up gathering)
MD5DBFile		db/MD5.db

# Mixing coefficient for average document change time blending:
# new = (mix/256)*old + (1-(mix/256))*measured
DocChangeMix		128

# Gatherd can use a probabilistic data structure for detection of known URL's.
# It speeds up reference processing significantly at the expense of a small
# fraction of URL's being incorrectly treated as already known. To enable it,
# set TricksterErrProb to negative binary logarithm of desired probability
# of a single error (default: 0=disabled).
TricksterErrProb	20

# When allocating the aforementioned data structure, leave room for at least
# TricksterStep more URL's.
TricksterStep		16384

# When SIGTERM is received, try hard to stop quickly by killing subprocesses
# (by default, we wait for them to finish). This can lead to orphaned buckets,
# which is fixed by the expirer automatically and also killed subprocesses
# are counted as recoverable errors and thus affect retrying.
HardShutdown		0

# Compress generated buckets (0=no, 1=yes, -1=generate ancient buckets [v3.0])
Compress		1

### Batch gatherer
[GBatch]

# Gather each document in a subprocess, avoiding possible resource leaks
Subprocess		0

### Expirer
[Expire]

# Enable expirer tracing (1=verbose, 2=list verdicts, 3=list ID tables)
Trace			1

# Once an object's age exceeds MinRevalidateAge, the object enters the
# revalidation cycle and it's revalidated once per RevalidateCycle seconds.
# The position on the cycle depends on hash of the URL to get the revalidations
# distributed evenly. MinRevalidateAge < RevalidateCycle is highly recommended.
# With RevalidateCycle set to zero, the cycle thing is suppressed and every
# object older than MinRevalidateAge is queued immediately.
MinRevalidateAge	7d
RevalidateCycle		14d

# Maximal age of queued object before it gets discarded (seconds; default: unlimited)
QueueDiscardAge		14d

# Maximal age of error marker before it gets discarded (seconds; default: unlimited)
ErrorDiscardAge		14d

# Queue keys have to be expired in a different way as we don't store their
# creation times. Each time the expirer is run, it expires a fraction
# of the queue keys according to some hash function, so that after
# approximately QKeyExpireAge seconds, all queue keys are expired.
QKeyExpireAge		14d

# Maximal age of robots.txt
RobotExpireAge		14d

# Name of temporary queue file
TmpQueueFile		db/queue.tmp

# Name of expiration timestamp file
StampFile		db/expire-stamp

# Put at most this number of objects per host in the queue file,
# the rest is kept only in the URL database. Set to a number higher
# than the expected number of pages gathered from one host during
# one day. (default: unlimited)
QueuePostpone		2000

# Queue prioritization: each document gets priority equal to:
#    its age in seconds
#  + QueueBonusRefresh		if it's scheduled for refresh
#  + QueueBonusRegather		if it's scheduled for regathering
#  - QueuePenaltyRetry		per retry after a recoverable error
#  + per-host bonus set by filter
QueueBonusRefresh	1000000000
QueueBonusRegather	2000000000
QueuePenaltyRetry	43200

# Drawing of an age histogram (optional) with HistNumBoxes+1 boxes
# (last one for items not fitting anywhere else) per HistBoxWidth sec.
HistNumBoxes		28
HistBoxWidth		86400

### Buckets
[Buckets]

# Name of the bucket file
BucketFile		db/objects

# Size of I/O buffer
BufSize			64K

# Size of shakedown buffer
# The largest bucket in the file must fit there
ShakeBufSize		2M

# Shakedown security: 0=low (fastest, but system crash can cause loss of some
# buckets), 1=safe (use a backup buffer of ShakeBufSize bytes on disk, only
# marginally slower), 2=synchronous (fsync after each block written)
ShakeSecurity		2

# Size of I/O buffer for reads of the whole bucket file (0=use mmap)
SlurpBufSize		64K

### Memory mapped access to files: whenever you specify 0 for I/O buffer
### size, memory mapping is used instead.
[FBMMap]

# Map this many bytes at once (needs to be a multiple of CPU page size)
WindowSize		1M

# When in need to extend a file, grow it by so many bytes (>= page size)
ExtendSize		1M

### Dumper
[Dumper]

# Charset used for output on terminal (default: utf-8)
TerminalCharset         iso-8859-2

# Width of terminal (X attributes are formatted to this right margin)
TerminalWidth		78

### Temporary files
[Tempfiles]

# Template for temporary file names (first %d=PID, second %d=counter)
Template		tmp/temp%d-%d

### Sorter
[Sorter]

# Trace sorting (print pass statistics etc.)
Trace			1

# Buffer used for presorting
PresortBuffer		64K

# Per-stream buffer (0=use mmap)
StreamBuffer		64K

### Indexer
[Indexer]

# Directory we build the index in
Directory		index

# Which database should we index:
# bucket:<file>		a bucket file generated by gatherd or gbatch
# text:<file>		textual input, buckets separated by a blank line
Source			bucket:db/objects

# Names of files generated by the indexer, relative to Directory
# (see doc/indexer for who is producing and reading what)
Parameters		parameters
Attributes		attributes
Notes			notes
Labels			labels
Sites			sites
URLList			url-list

#ifndef CONFIG_BARE
Fingerprints		fingerprints
LabelsByID		labels-by-id
Checksums		checksums
Links			links-by-url
RefTexts		ref-texts
LinkGraph		link-graph
LinkGraphIndex		link-graph-index
Merges			merges
Signatures		signatures
Matches			matches
NotesNew		notes-new
Keywords		keywords
FeedbackGatherer	gatherer-feedback
#endif

WordIndex		word-index
StringIndex		string-index
Lexicon			lexicon
LexRaw			lexicon-raw
LexOrdered		lexicon-ordered
LexWords		lexicon-words
Stems			stems
StemsOrdered		stems-ordered
References		references
StringMap		string-map
StringHash		string-hash
Cards			cards
CardAttributes		card-attrs

# If you want to use online merging of multiple indices by the search server,
# you need to generate the card-prints file as well.
#CardPrints		card-prints

# Minimum size of document to consider it in detection of duplicates
# (an analogue of Gather.MinSummedSize)
MinSummedSize		64

#ifdef CONFIG_LANG
# Enable automatic recognition of languages
AutoLang		1
#endif

# Labels extracted from object description and kept for each URL
# of multi-URL documents (see doc/objects for attribute types)
LabelAttrs		DEJKLTVcs

# Attributes containing edges of the link graph
LinkAttrs		AFIRYad

# Maximum vertex degree in link graph (extraneous edges ignored; max. 65535)
MaxGraphDegree		65535

# Link types we extract reference texts for (default: none=disabled reftexts)
RefLinkTypes		IR

# Maximum length of reference text
RefMaxLength		80

# Minimum number of alphanumeric characters in reference text
RefMinLength		3

# Maximum number of reference texts for a single card
RefMaxCount		10

# Filter to be used by the indexer (usually the same as Gatherer.Filter)
Filter			cf/filter

# The lexfreq utility can be used to dump lexicon sorted by word frequencies to:
LexByFreq		lexicon-by-freq

# Enable debugging dump of lexicon equivalence classes
LexClasses		lexicon-classes

# Average bucket size for string hash (in entries)
StringAvgBucket		120

# Buffer size for various files used by the indexer (0=use mmap)
FileBufSize		64K

# Progress indicator: update it every Progress documents processed (0=off)
# and show on screen or in the process status line
Progress		100
ProgressScreen		0
ProgressStatusLine	1

# Sorters: Delete source files as soon as possible
SortDeleteSrc		0

# Maximum number of objects we're willing to process
# (useful for testing indexer on a subset of documents; default: unlimited)
#MaxObjects		10000

# Enable conversion of frameset documents to redirects to one of the
# frames instead of indexing the noframe version. (Useful when your documents
# contain bogus noframe texts like "the stone age is already over" :) )
FramesetToRedir		1

# Default weight of a document before all bonuses and penalties are applied
DefaultWeight		64

### These settings apply if the indexer is communicating with a Shepherd
### server remotely.
[IConnect]

# Timeout during connection setup and initial handshake [sec]
ConnectTimeout		300

# Timeout when waiting for a reply
ReplyTimeout		300

# Number of connect retries and delay between them
RetryCount		96
RetryDelay		900

# Send gatherer feedback back to Shepherd
SendFeedback		1

### Approximative matcher of similar documents
[Matcher]

# The number of signatures computed for a document.  The bigger the number
# chosen, the more memory is used and the more time is spent.  However, if the
# number is set too low, the approximation algorithm is not accurate.  If set
# to zero, the algorithm is disabled.
Signatures	50

# The signatures are computed from a cyclic hash of CONTEXT consecutive
# words.  Context must be neither too big (otherwise a small change in a
# word influences broad environs), nor too small (otherwise the cyclic hash
# does not involve the word order very well).
Context		4

# Only documents longer than MinWords words are merged by the approximative
# matcher.  This is convenient because a small change in a short text (about 20
# words) can be significant, though the signatures are almost equal.  The
# default value is 0.
MinWords	20

# The documents are considered similar if at least THRESHOLD hashes of
# their signatures are equal.  The ratio EQUALHASHES/SIGNATURES is a
# good approximation of the fraction of the documents' content that is equal.
Threshold	47

# The signatures are sorted PASSES times on a hash chosen at random.  Each
# time, the index is read linearly and for each value of the key hash, similar
# documents are detected.  The more passes, the slower and more precise.
Passes		5

# Blocksize (in documents) of the matching pass.  Sorted signatures are
# consecutively read in blocks not longer than BLOCK and processed
# _quadratically_ in the memory.
Block		64

### {URL,File} keywords processor.
[Keywords]

# Level of debugging outputs: 1 explains changes, 2 is for programmers
Trace			0

# Extraction of words from URL's (a list of regex substitution rules + weight [0 to 3])
# (lexmapper is run on the result)
HardcodedURLPattern
HardcodedFilePattern
# The hardcoded rules are equivalent to the following rules, but they are much faster:
#URLWordPattern		http://(www[^.]*\.|)([a-z0-9]+)\.[a-z]+/(|(index|default)\.[a-zA-Z0-9]+)	\2	3
#URLWordPattern		http://(www[^.]*\.|)([^.]+)\.[a-z]+/(|(index|default)\.[a-zA-Z0-9]+)	\2	2
#URLWordPattern		http://(www[^.]*\.|)([^.]+)\.[^/]*/(|(index|default)\.[a-zA-Z0-9]+)	\2	1
#FileWordPattern		....://[^?]*/(([^/.?=;]+)|([^/?=;]+)\.[^/.?=;]*)/?$	\2\3	2
#FileWordPattern		....://[^?]*/(([^/.?=;]+)|([^/?=;]+)\.[^/.?=;]*)/?\?.*	\2\3	1

# Allow extraction of words from file names (default=0=no, 1=images only, 2=everywhere)
NameWords		1

# You can always specify the limits for URL, File, and Catalog keywords (-1=unlimited).
# Enter up to 4 numbers for weights 0..3:
#	0=ignored, 1=normal weight, 2..3=increased weight for URL and File)
# Maximum number of keywords from one URL, the others are cut
URLMaxURLWords		-1	4	4	1
URLMaxFileWords		-1	-1	-1
URLMaxCatalogWords	-1	30
# Maximum number of keywords from one site, the others are cut
SiteMaxURLWords		-1	16	8	2
SiteMaxFileWords	-1	-1	-1
SiteMaxCatalogWords	-1	50
# Maximum total number of keywords, the others are cut
TotalMaxURLWords	-1	32	24	6
TotalMaxFileWords	-1	-1	-1
TotalMaxCatalogWords	-1	60
# When a keyword has frequency bigger than the threshold, its weight is decreased by 1
DecreaseURLWords	-1	200	150	10
DecreaseFileWords	-1	-1	10000
DecreaseCatalogWords	-1	100
# When a keyword has frequency bigger than the threshold, it is removed
RemoveURLWords		-1	200	-1	-1
RemoveFileWords		-1	 -1	20000
RemoveCatalogWords	-1	100

### Backlink generator
[Backlinker]

# Maximal number of frame backlinking passes (default: unlimited),
# that is the maximum number of frame indirections.
MaxFrameDepth		10

# Path to temporary files (relative to Indexer.Directory)
FrameGraph		frame-graph
ImageGraph		image-graph

### Lexicon
[Lexicon]

# Words shorter than this limit are ignored
# (all limits are in Unicode characters, not UTF-8)
MinWordLenIgnore	2

# Words shorter than this limit are treated as nonindexable
MinWordLen		2

# Words longer than this limit are treated as nonindexable
MaxWordLen		32

# Maximum length of words resembling hexadecimal numbers (i.e.,
# composed from both digits and letters)
MaxHexWordLen		10

# Words longer than this limit and containing only ASCII characters
# are subject to extra checks which try to identify uuencode/base64 etc.
MaxCtrlWordLen		16

# Maximum size of inter-word gap (sequences of >MaxGap non-indexed words are
# squeezed to MaxGap word positions to avoid wasting precious word numbers)
MaxGap			3

#ifdef CONFIG_MAX_CONTEXTS
# How many hash slots we use for contexts. Beware, it must be set to
# at least twice the number of context-dependent words and at most to
# SHERLOCK_MAX_CONTEXTS (see config.mk).
ContextSlots		256
#endif

#ifndef CONFIG_BARE
# Explicitly set categories of some words (and all their lexical variants)
# Overrides MinWordLenIgnore and MinWordLen, but not MaxWordLen.
# To be written in UTF-8 characters.
WordIgnored		the
WordNormal		
WordGarbage		
#ifdef CONFIG_MAX_CONTEXTS
WordContext		of and in by is for on it this be with
WordContext		a b c d e f g h i j k l m n o p q r s t u v w x y z
WordContext		0 1 2 3 4 5 6 7 8 9 @ & +
WordContext		www
#endif
#endif

### Character classes used by the indexer
[Alphabet]
# In indexer's eyes, characters can be of the following types:
#	Space		space separating words
#	Alpha		alphabetical character forming words
#	Punct		punctuation separating words
#	Break		punctuation separating sentences
#	Singleton	single-character words
#	Inherit		inherit properties from UniCode categories
# by default, all characters are of type Space.

Inherit			0000-02AF 0400-052F FB00-FB06
Singleton		@ & +
Break			. ? !

### Chewer
[Chewer]

# Sizes of pre-sorting buffers for words and strings
WordBufSize		128K
StringBufSize		128K

# Bitmap of string categories we index
StringCats		0xffff

# Maximum number of string entries we index per document
StringMax		4K

# Only first PhraseLimit words of a document are indexed with full
# positions suitable for phrase searching (at most 4095)
PhraseLimit		4095

# Object attributes to copy to index cards (LabelAttrs are included automatically, "X" too)
CardAttrs		GKMNbekiqyz.

# Size of document buffer. Any single document must fit here, if
# it doesn't, it gets trimmed.
DocBufSize		64K

# Chewed documents are truncated to the following length
ExcerptMax		256K

# If we encounter a giant class (see Merger.GiantDocuments), we ignore these
# meta types (default: none) and decrease the weight of the cards by GiantPenalty.
# Ignoring means that they are not indexed into the fulltext index, however
# they remain present in the cards.
#GiantBanMeta		0xffe8
GiantPenalty		50

#ifndef CONFIG_BARE
# Compute average weight of a character typed in the document.  If it is bigger
# than SwindlerThreshold, the types with nonzero weight are remapped to text.
# Other types (e.g. alt, link) are left untouched.
#			resvd text  emph  small hdr   HDR   alt
TypeWeights		0     20    50    10    50    80    0
SwindlerThreshold	40

# Penalize documents with almost no contents.  Compare the number of
# alphanumerical characters with the threshold.
NoContentsPenalty	30
NoContentsThreshold	100

# If the content-type of the document is one of the following ones, check for
# existence of a title and out-going links.  Penalize documents without them.
AddHyperTextType	text/html
NoLinksPenalty		10
NoTitlePenalty		10
#endif

# Cards that cannot be compressed to less than MinCompression% of their
# original size are stored uncompressed
MinCompression		90

### Merging of identical / similar documents
[Merger]

# If defined, classes larger than GiantDocuments (counting duplicates)
# or than GiantRedirects (counting also redirects) are marked giant
GiantDocuments		100
GiantRedirects		3000

### Indexer reporter
[Reporter]

# If defined, equivalence classes larger than ClassThreshold are logged to ClassLog
ClassLog		large-classes
ClassThreshold		30

#ifdef CONFIG_FILETYPE
# Generate statistics of filetypes
FiletypeStats		1
#endif

#ifdef CONFIG_LANG
# Generate statistics of languages
LanguageStats		1
#endif

# Generate per-domain statistics of the given level (0=off, 1=top-level domains, 2=2nd-level, ...)
#DomainStats		2

### Generation of URL keys
[URLKey]

# Consider http://www.(.*) and http://\1 equivalent. This is a dirty hack which
# should be used if you remove "www." from URL keys in the filters. Will be
# replaced by better handling of duplicates in the gatherer.
WWWHack			1

# Table of equivalent URL prefixes.
PrefixTable		cf/url-equiv

#ifdef CONFIG_LANG
### Language processing
[Lang]

# List of known languages (names according to RFC 1766) together with their aliases
Language	en
Language	cs cz
Language	sk
Language	pl
Language	hu
Language	de
Language	nl
Language	fr
Language	es
Language	it

# List of stemming rules (language, algorithm, optional parameters)
# Porter's stemming algorithm for English
#Stemmer		en	porter

# List of synonymic dictionaries (language, file)
# These are text files, lines correspond to synonymic classes (not necessarily forming
# an equivalence), words are separated by colons.
#SynDict		cs	dict/cs/synonyma

### Computing the tables of the language detector
[LangTables]

# Print debugging information?  (2 is very verbose.)
Trace		0
# Number of documents to update progress indicator after (0=off)
Progress	0

# Filter to be used by lang-tables (usually the same as Gatherer.Filter)
Filter		cf/filter

# Where to store the log about processed buckets
BucketStateFile	tmp/lang-buckets.log
# Where to store the sequence frequencies
FrequencyFile	tmp/lang-freq.log
# Where to store the generated config file
CoefficientFile	tmp/lang-coef.log
# Where to store the log about achieved thresholds
ThresholdFile	tmp/lang-threshold.log

# How large a part of the bucket-file should be used for the training-set (in per mille)
TrainingRatio	666

# Only sequences of 1..MAXSEQUENCELENGTH consecutive letters will be considered
# when building the tables.  Hard maximum is 4.
MaxSequenceLength	4

# For every language, NUMBEROFBESTSEQ most typical sequences are selected
NumberOfBestSeq		400

# Construct the detection tables for the following languages:
# - AccentLanguages are languages with accented characters; two tables are
#   computed for each of them (accented and unaccented variant)
# - NoAccentLanguages are other languages
AccentLanguages		cs sk pl hu
NoAccentLanguages	en de nl fr es it

[LangDetect]

# We refuse to detect the language of documents shorter than MINDOCUMENTLENGTH
# found sequences
MinDocumentLength	200

# Include the auto-generated configuration file of the language detector
IncludeTables cf/lang-detect

#endif

### Search server
[Search]

### Daemon settings

# Name of log file (if not specified, stderr is used)
LogFile			log/sherlockd-%Y%m%d

# Name of status file (sherlockd writes it when it starts accepting queries)
StatusFile		lock/sherlockd.status

# Log incoming and rejected connections
LogIncoming		0
LogRejected		0

# Log requests
LogRequests		1

# Log replies (0=off, 1=short, 2=verbose)
LogReplies		2

# Port we listen on
Port			8192

# This enables a so-called "hydra mode" in which the search server runs a given number
# of child processes on the same data, listening on ports Port, Port+1, ...
# It is just a temporary hack to make sherlockd use multiple CPU's on SMP systems.
#HydraProcesses		2

# Maximal number of connections on listen queue (max. is OS dependent)
ListenQueue		32

# Access list (IP addresses only). See Gather.AllowIP for details on access lists.
Allow			127.0.0.1
# hardcoded last rule is Deny 0.0.0.0/0

# Incoming connection idle timeout in seconds
ConnTimeOut		60

# Password for control functions
ControlPassword		FooBar

# Number of cached replies (must be non-zero)
CacheSize		10

# If query processing takes more than this number of seconds, the search
# server considers itself (dead|live)locked and dies. (default: 0=off)
QueryWatchdog		60

# Maximum size of memory mapped references (in MB)
MemMapZone		16

# If two mappings are closer than this number of bytes, merge them
MemMapElideGaps		16K

# If the mappings are smaller than this number of MB, prefetch them
MemMapPrefetch		4

## List of all databases we search in. If there are multiple databases, each of them
## has its own Database line followed by WordIndex etc. for that database.
# Database declaration: name, parameter file, card list, attribute file, reference list
Database		main index/parameters index/cards index/card-attrs index/references
# Word index [optional]: lexicon [stems]
WordIndex		index/lexicon index/stems
# String index [optional]: string map, string hash
StringIndex		index/string-map index/string-hash
# Points per word, string and meta types (see lib/index.h for a list), subject to scaling by WordWeightScale
# For meta types, they can consist of multiple entries separated by a slash which correspond to different meta weights.
# String types have no position info, so we need to compensate BlindPenalty here.
#			resvd text  emph  small hdr   HDR   alt   word7
WordWeights		0     20    50    10    50    80    10    0
#			title keywd misc  urlkw         file      ext   <6>   <7>   <8>   <9>   <10>  <11>  <12>  <13>  <14>  <15>
MetaWeights		100   0     0     50/80/150/200 10/20/20  80    0     0     0     0     0     0     0     0     0     0
#			resvd url   host  dom'n ref   link  <6>   <7>
StringWeights		0     10    10    10    10    10    0     0
# If you want to merge multiple indices using card fingerprints
#CardPrints		index/card-prints
# If this database is optional
#IsOptional		1
## end of database local data

### Limits

# Maximum number of matching documents we find (they are guaranteed to be the best ones)
NumMatches		1000

# Maximum number of objects output (default: unlimited)
MaxOutputObjects	100

# Maximal number of unique words in the expression (including word complexes)
# Hard maximum is HARD_MAX_WORDS in search/sherlockd.h
MaxWords		32

# The same for phrases (including automatic near-matchers)
MaxPhrases		8

# Maximal number of boolean expression elements (beware, hard maximum
# is 32 and space complexity of boolean expression processing is
# exponential in the number of elements used)
MaxBools		16

# Maximal number of real words matching single word in the expression
# (there can be multiple ones due to wildcards and pattern matching)
MaxWordMatches		256

# Maximum number of lexicon entries to match for wildcards
MaxWildcardZone		64K

# Minimum size of non-wildcard prefix in wildcard search
MinWildcardPrefix	1

### Query parameters

# Debugging flags [DEBUG in query] (see doc/search for explanation)
Debug			2

# Maximum number of context chars to show [CONTEXT in query]
ContextChars		240

# Maximum number of title/metatitle/keyword chars to show [TITLELEN in query]
TitleChars		240

# Maximum number of printed context intervals [INTERVALS in query]
Intervals		4

# Maximum number of results per site [SITEMAX in query] (default=0=unlimited)
SiteMax			0

# Maximum number of URL's to show for single card [URLMAX in query] (default=unlimited)
#URLMax			1

# Maximum number of redirect URL's (attribute y) for a single URL (default=unlimited)
RedirectURLMax		8

# Error recovery options. Hard errors (e.g., syntax errors in the query) always
# cause the query to be rejected, processing of soft errors (non-indexed words,
# wildcard expansion too large etc.) depends on these switches: if AllowApprox
# is turned on, they are just reported as word status codes, else if PartialAnswers
# are turned on, they cause only a lookup in a single database to fail with
# a local error message; if both are turned off, the query is rejected.
# These options can be overridden for a single query by APPROX and PARTIAL.
AllowApprox		0
PartialAnswers		0

# Default accent mode [ACCENT in query]:
# 3 (auto2) = same as auto, but done separately for each word
# 2 (strict) = require accents to match
# 1 (ignore) = ignore accents completely
# 0 (auto) = if query contains no accents, ignore accents,
#            else match accents in accented documents and ignore otherwise
AccentMode		0

# Enable morphing of words [MORPH in query]: for each word in the query, automatically
# add its stem and other morphological variants, but with a slight penalty. If set to 2,
# differently accented versions of these variants are added as well according to AccentMode.
Morphing		2

# Enable spelling checker and set the number of variants to find [SPELL in query] (max. 16)
Spelling		4

# Enable searching for synonyma [SYN in query]
# 0=off, 1=only list, 2=search, 3=also accents, 4=morph according to MORPH setting
Synonyming		0

# Which synonymic variants are automatically added to the query when SYN >= 2 [SYNEXP in query]
# a bit map, bit 31 controls variants with id>=31.
#SynExpand		0xffffffff

# Secondary sorting criterion on Q ties [SORTBY in query] (default: SITE)
DefaultSortBy		SITE

# A bitmap of word types used when no type is specified in the query
# lower 8 bits are word types, upper 16 are meta types
DefaultWordTypes	0xffff00ff

### Spelling checker parameters (all frequencies are logarithmic scaled to 0..255)

# Too short words are not checked
SpellMinLen		3

# Words with this or greater frequency are considered correct and not checked
SpellGoodFreq		128

# Only variants with frequency greater by at least SpellMargin are considered
SpellMargin		30

# If the original word has frequency < SpellDwarf and no matches are found
# with the default margin, use SpellDwarfMargin instead.
SpellDwarf		5
SpellDwarfMargin	1

# When sorting spelling checker output, we calculate similarity points
# by taking 100*frequency - sum of penalties for all types of differences:
# adding/deleting/changing a single character, transposing two characters,
# changing accents.
SpellAddPenalty		3000
SpellDelPenalty		3000
SpellModPenalty		2000
SpellXPosPenalty	0
SpellAccentPenalty	300

# Common letter changes: they get SpellCommonPenalty instead of SpellModPenalty;
# always considered without accents.
SpellCommonPairs	yi sz dt
SpellCommonPenalty	500

### Calculating Q factor

# Default points per word occurrence
WordBonus		10000

# Conversion of document weights to Q
DocWeightScale		10

# Conversion of word weights and penalties to Q
WordWeightScale		10

# Words matched, but without positions get this penalty
BlindMatchPenalty	10

# Words with mismatched accents in accent mode 0 get this penalty
MisaccentPenalty	5

# The second best word score gets divided by this factor
SecondBestReduce	8

# Morphological variants get MorphPenalty, stem gets StemPenalty instead
StemPenalty		20
MorphPenalty		30

# Synonyma get this penalty
SynonymumPenalty	80

# When doing proximity matching, this much Q units are paid for each word position skipped
ProxPenalty		1000

# If the total weight of all words in a phrase minus all the penalties paid
# is less than this limit, the match is ignored.
ProxLimit		-5000

### Magic operations

# Try to find complexes across different components of simple search queries
MagicComplexes		1

# Try to merge the whole simple query to a single word, but restrict its class
#MagicMergeWords	1
#MagicMergeClasses	0x00080000
#MagicMergeBonus	1000

# Try to add near matching for the whole simple query, but at most
# for this number of words (0=off)
MagicNear		4

# Create at most MaxNears near-matchers per query
MaxNears		2

# When doing automatic near matches, we add NearBonusWord points per
# word and subtract NearPenaltyGap per inter-word gap.
# If the word is adjacent in the query with the previous seen word,
# we add NearBonusConnect points more.
NearBonusWord		700
NearPenaltyGap		500
NearBonusConnect	200

### Search server control script and keeper
[SKeeper]

# Number of retries when testing whether the daemon started responding to queries
TestRetry		180

# Delay in seconds between two retries
TestWait		1

# If the daemon comes up, make sure it really works by running this
# set of queries (the number at the start of the query are minimum
# numbers of responses)
#TestQuery		0 "linux"
#TestQuery		0 "martin"

# The daemon can be watched by a script called skeeper which restarts
# it in case of crash and notifies administrators by mail. Comment out
# Crash* if you don't want to run skeeper. You can use multiple mail
# addresses separated by spaces.
#CrashMail		sherlock@host2.netcentrum.cz

# Timing of restarts: if sherlockd ran for less than CrashWaitThreshold
# seconds and the timeout was at most CrashWaitCeiling, double the timeout.
#CrashWaitThreshold	60
#CrashWaitCeiling	300

# Command for rotation of logs (comment out to disable rotation), see utils/rotate-log.pl
RotateLogs		bin/rotate-log 1 14 log/sherlockd-* >/dev/null

# Files to store the process ID's in
DaemonPIDFile		lock/sherlockd.pid
KeeperPIDFile		lock/skeeper.pid

# Lock file used for index swaps and its timeout in seconds
SwapLock		lock/swaplock
SwapLockTimeout		10

### File transport utilities
[FileSend]

# Trace requests
Trace			1

# Compression level (0=none, 9=best; default: 0)
Compression		0

[FileRecv]

# Receive timeout (seconds; default: 0=unlimited)
Timeout			60

### Local settings
include cf/local
