# Configuration of various library modules and utilities (included by cf/sherlock)

######## Memory Mapped Access to Files ##########################################

# Whenever you specify 0 for I/O buffer size, memory mapping is used instead.
FBMMap {

# Number of bytes to map at once (must be a multiple of the CPU page size)
WindowSize		1M

# When a file needs to be extended, grow it by this many bytes (>= page size)
ExtendSize		1M

}

######## Temporary files ########################################################

Tempfiles {

# Filename prefix for temporary files ("pid(-tid)-counter" is appended).
# The directory should not be writable by malicious users.
Prefix			tmp/temp

}

######## Threads ################################################################

Threads {

# Default stack size for threads
DefaultStackSize	64K

}

######## Sorter #################################################################

Sorter {

# Trace sorting (print pass statistics etc.)
Trace			1

# Size of the buffer used for presorting
PresortBuffer		64K

# Size of the per-stream buffer (0 = use mmap instead)
StreamBuffer		64K

}

######## URL processing #########################################################

URL {

# Ignore spaces at the start/end of a URL
IgnoreSpaces		1

# Ignore underflows in relative paths (/../ from root)
IgnoreUnderflow		1

# Some URLs with many repeated components are filtered out to avoid infinite
# URLs (e.g. http://czech.recoder.cz/win/iso/win/iso/file.html or
# http://a.com/?a=b&a=b&a=b).
# The URL is split into components at any of the separators specified here.
# The separators themselves are then discarded and only the components
# between them are examined.
ComponentSeparators	/&?

# A URL is filtered out if it contains a sequence of at most MaxRepeatLength
# consecutive components and that sequence is repeated more than
# MinRepeatCount times.  The default values are a high MinRepeatCount and
# a low MaxRepeatLength, so by default the mechanism is disabled.
MinRepeatCount		4
MaxRepeatLength		4

}

######## Buckets ################################################################

Buckets {

# Default name of the bucket file
BucketFile		db/objects

# Size of the I/O buffer
BufSize			64K

# Size of the shakedown buffer. The largest bucket in the file must fit in it.
ShakeBufSize		2M

# Shakedown security level:
#	0 = low (fastest, but a system crash can cause loss of some buckets)
#	1 = safe (use a backup buffer of ShakeBufSize bytes on disk, only
#	    marginally slower)
#	2 = synchronous (fsync after each block written)
ShakeSecurity		2

# Size of the I/O buffer for reads of the whole bucket file (0 = use mmap)
SlurpBufSize		64K

# Number of bytes to prefetch when reading a random bucket
PrefetchSize		8K

}

######## The filter engine ######################################################

Filter {

# Trace compiling, optimizing, and interpreting of filters
Trace			0

# SWITCH commands with more than HashLimit equality tests of strings (operators
# == and ===) are optimized using hash tables.  TrieLimit is the corresponding
# limit for building tries for operators =* and =**, and TreeLimit is the limit
# for building binary search trees for operators =# and =##.
HashLimit		4
TrieLimit		4
TreeLimit		4

# Perform various optimizations when parsing filters (see doc/filter)
Optimize		1

# If set, the optimized filter is dumped into the given text file
#DumpFilterTo		tmp/optimized-filter

}

######## Character classes (shared by indexer and sherlockd) ####################

Alphabet {

# Characters can be of the following types:
#	Space		space separating words
#	Alpha		alphabetical character forming words
#	Punct		punctuation separating words
#	Break		punctuation separating sentences
#	Singleton	single-character words
#	Inherit		inherit properties from Unicode categories
# By default, all characters are of type Space.
# Format: array of characters, Unicode values, and/or Unicode ranges
# (the values below are written in hexadecimal).

Inherit			0000-02AF 0400-052F FB00-FB06
Singleton		@ & + 00A7
Break			. ? !
# NOTE(review): 0023 is '#'; presumably it is given by value because a literal
# '#' would start a comment -- confirm against the config parser.
Alpha			0023

}

######## Data dumpers ###########################################################
# (idxdump, objdump etc.)

Dumper {

# Charset used for output on the terminal (default: utf-8)
TerminalCharset         iso-8859-2

# Width of the terminal (X attributes are formatted to this right margin)
TerminalWidth		78

}

######## Shcp -- the file copying utility #######################################

Shcp {

# Command used to invoke an ssh client
SshProgram		ssh

# After connecting to a remote host via ssh, we change the current directory
# to RunDir and invoke ShcpProgram in client mode.
RunDir			~/run
ShcpProgram		bin/shcp

# Buffer size for socket read/write
SockBufferSize		64K

# Buffer size for file read/write (must be a multiple of SockBufferSize;
# with DirectFileAccess it must be a multiple of 512)
FileBufferSize		64K

# Higher verbosity level
Trace			0

# Use uncached disk access
DirectFileAccess	0

# Limit the file transfer speed (in MB/s, 0 = no limit)
Limit			0

# Header connection timeout (seconds; 0 = unlimited)
HeaderTimeout		60

# Receive timeout (seconds; 0 = unlimited)
Timeout			60

# List of ssh options
#SshOption		'-A' '-p 22'

}

######## Generation of URL keys #################################################

URLKey {

# Consider http://www.(.*) and http://\1 equivalent.  This is a dirty hack
# which should be used if you remove "www." from URL keys in the filters.
# It will be replaced by better handling of duplicates in the gatherer.
WWWHack			1

# Table of equivalent URL prefixes
PrefixTable		cf/url-equiv

}
