#!/bin/sh
# Sherlock Indexer Script
# (c) 2001--2006 Martin Mares <mj@ucw.cz>
# (c) 2003--2005 Robert Spalek <robert@ucw.cz>

# Print the command-line help to stderr and terminate the script with
# exit status 1.  The option letters in the synopsis mirror the getopts
# specification used by the option parser (including -W).
function usage
{
	cat >&2 <<EOF
Usage: indexer [-12RUWacd:fi:luvwC:S:] [<source> [<dest-dir>]]

-1	Stop after stage 1
-2	Start with stage 2 (needs -d3 at the last time)
-R	Try to resume an interrupted indexation with the same data source
-a	Ignore filters and accept all documents
-c	Only clean files and exit (set -d)
-d NUM	Delete files of level smaller than NUM (default=4)
	9=index, 8=incremental, 7=logs, 4=useful/short debug files, 3=labels,
	2=huge files, 1=really temporary files, 0=keep all versions
-f	Force deletion of the old index
-i DIR	Incremental scanner looks into this directory for a log file
-l	List index files between passes
-u	Upload feedback to the gatherer
-U	Do not upload feedback to the gatherer
-v	Be verbose (also enables progress indicators and tracing messages)
-vv	Be more verbose (and so on)
-w	Calculate weights and send gatherer feedback only
-W	Skip calculation of dynamic weights
-C, -S	Global configuration options passed to all programs
EOF
	exit 1
}

# Start from a clean slate: none of the option variables may be inherited
# from the caller's environment.
unset G LISTS STAGE1ONLY STAGE2ONLY WEIGHTSONLY CLEAN FORCE RESUME FEEDBACK NOWEIGHTS
DELETE=4
INCREMENTAL=
VERBOSE=0
# From now on any unchecked failing command aborts the script.
set -e
# G accumulates the global options handed to every bin/* program.
while getopts "12RUWacd:fi:luvwC:S:" OPT ; do
	case "$OPT" in
		1)	STAGE1ONLY=1 ;;
		2)	STAGE2ONLY=2 ;;
		R)	RESUME=1 ;;
		U)	FEEDBACK=0 ;;
		W)	NOWEIGHTS=1 ;;
		a)	G="$G -SIndexer.Filter=" ;;
		c)	CLEAN=1 ;;
		d)	DELETE=$OPTARG ;;
		f)	FORCE=1 ;;
		i)	INCREMENTAL="-i $OPTARG" ;;
		l)	LISTS=1 ;;
		u)	FEEDBACK=1 ;;
		v)	VERBOSE=$((VERBOSE + 1)) ;;
		w)	WEIGHTSONLY=1 ;;
		C|S)	G="$G -$OPT$OPTARG" ;;
		*)	usage ;;
	esac
done
# Each -v raises the trace level; any verbosity also enables the progress screen.
if [ "$VERBOSE" -ge 1 ] ; then
	G="$G -SIndexer.ProgressScreen=1 -SIndexer.Trace=$VERBOSE"
fi

# Shift the words consumed by getopts off the argument list, remembering
# them in OPTION_ARGS so that they can be passed on again.  OPTIND is
# counted back down to 1 in the process.
OPTION_ARGS=
until [ "$OPTIND" -le 1 ] ; do
	OPTION_ARGS="$OPTION_ARGS $1"
	shift
	OPTIND=$((OPTIND - 1))
done

# What remains is [<source> [<dest-dir>]]; a third argument is an error.
[ -z "$3" ] || usage
if [ -n "$2" ] ; then
	G="$G -SIndexer.Directory=$2"
	OPTION_ARGS="$OPTION_ARGS -SIndexer.Directory=$2"
fi
if [ -n "$1" ] ; then
	G="$G -SIndexer.Source=$1"
fi

# Record an informational message ($1) through bin/logger under the
# "indexer" name with category "I".
log()
{
	bin/logger indexer I "$1"
}

# Record a fatal message ($1) through bin/logger under the "indexer" name
# with category "!" and terminate the script with exit status 1.
die()
{
	bin/logger indexer '!' "$1"
	exit 1
}

# delete LEVEL NAME...
#
# Remove the given files from $DIR when LEVEL is smaller than the delete
# level selected with -d ($DELETE); otherwise do nothing.
#
# Each NAME is relative to $DIR and may be a glob pattern (for example
# 'attributes.*').  The pattern is expanded inside $DIR, so versioned
# copies created by keep() are really removed; a pattern matching nothing
# is silently ignored thanks to rm -f.  Processing stops at the first
# empty argument.
function delete
{
	local level=$1
	# With an empty IFS the unquoted $2 below undergoes pathname
	# expansion only, never word splitting.
	local IFS=
	if [ "$level" -lt "$DELETE" ] ; then
		while [ -n "$2" ] ; do
			# $2 is deliberately unquoted so that glob patterns expand.
			# shellcheck disable=SC2086
			rm -f -- "$DIR"/$2
			shift
		done
	fi
}

# keep SUFFIX NAME...
#
# When the delete level is 0 or lower (-d0), copy every listed file
# $DIR/NAME to $DIR/NAME.SUFFIX.  Does nothing at any other delete level.
# Processing stops at the first empty argument.
keep()
{
	[ "$DELETE" -le 0 ] || return 0
	suffix=$1
	until [ -z "$2" ] ; do
		cp "$DIR/$2" "$DIR/$2.$suffix"
		shift
	done
}

# stats LABEL
#
# Log the disk usage of $DIR (first column of "du -s"), tagged with LABEL.
# When -l was given ($LISTS is set), also feed a long listing of $DIR to
# bin/logger with category "D".
stats()
{
	log "Disk usage $1: $(du -s "$DIR" | cut -f 1) blocks"
	if [ -n "$LISTS" ] ; then
		ls -Al "$DIR" | bin/logger indexer D
	fi
}

# sizes INDEX_DIR
#
# Run bin/sizer on the card-attrs, cards, lexicon, references and
# string-map files of INDEX_DIR, then log the total size in bytes of the
# directory (du -bs) through bin/logger under the "sizer" name.
# The result is also left in the global variable total_index.
# INDEX_DIR is quoted throughout so that paths containing whitespace work.
function sizes
{
	bin/sizer "$1"/{card-attrs,cards,lexicon,references,string-map}
	total_index=$(du -bs "$1" | cut -f 1)
	bin/logger sizer I "total index size is $total_index"
}

# Run "bin/iconnect --disconnect" with the global options when the
# configured data source starts with "fd:".  Any other source, including
# an empty one or one containing whitespace, is silently left alone.
function disconnect
{
	case "$CF_Indexer_Source" in
		fd:*)
			# $G is a space-separated option list and must be word-split.
			# shellcheck disable=SC2086
			bin/iconnect --disconnect $G
			;;
	esac
}

# Evaluate the output of bin/config as shell code.  Judging from the
# variables read later in this script, it defines CF_Indexer_Directory,
# CF_Indexer_Source, CF_Indexer_CardPrints and the CF_Indexer_SubIndex_Name
# array.  NOTE(review): "not/configured" looks like the fallback value for
# Directory and Source -- confirm against the bin/config documentation.
# The substitution is intentionally left unquoted here; the exact output
# format of bin/config is not visible from this script.
eval `bin/config $G 'Indexer{Directory=not/configured; Source=not/configured; LexByFreq; CardPrints; @SubIndex{Name; -#TypeMask; -#IdMask}}'`

# Build a flat list of subindex names: for every element of
# CF_Indexer_SubIndex_Name, its first word preceded by a single space.
# The result is empty when no subindices are configured.
SUBINDICES=`
	function get1 { echo -n " $1" ; }
	for s in "${CF_Indexer_SubIndex_Name[@]}" ; do
		get1 $s
	done
`
# Directory in which the index is built; all file helpers work relative to it.
DIR="$CF_Indexer_Directory"

# With -R, reuse the data source recorded in $DIR/source by the previous
# run, unless the configured source starts with "fd:".  A missing or
# unreadable record is fatal.  The expansions are quoted and the two
# conditions are tested separately, so an empty source or one containing
# whitespace cannot break the test.
if [ -n "$RESUME" ] && [ "${CF_Indexer_Source:0:3}" != "fd:" ]; then
	if ! CF_Indexer_Source=$(cat "$DIR/source" 2>/dev/null); then
		die "Cannot find $DIR/source"
	fi
fi

# For delete levels above 2, pass SortDeleteSrc=1 to all programs.
if [ "$DELETE" -gt 2 ]; then
	G="$G -SIndexer.SortDeleteSrc=1"
fi

# Housekeeping done on every run.  The patterns are quoted so that the
# shell cannot expand them against the current working directory; they
# name files relative to $DIR.
delete 0 'attributes.*' 'merges.*'
delete 4 lexicon-by-freq
# -c: remove everything below the selected delete level and stop.
if [ -n "$CLEAN" ]; then
	delete 1 frame-graph image-graph
	delete 2 links-by-url ref-texts url-list-translated
	delete 2 lexicon-raw
	delete 3 labels-by-id labels
	delete 4 merges fingerprints checksums signatures link-graph link-graph-index keywords url-list
	delete 4 attributes notes notes-new
	delete 4 rank-vector rank2-vector
	delete 7 large-classes matches weights lexicon-classes
	delete 8 incremental
	if [ -n "$SUBINDICES" ] ; then
		# Intermediate files live inside each subindex directory.
		for s in $SUBINDICES ; do
			delete 2 "$s/string-index" "$s/word-index" "$s/lexicon-ordered" "$s/lexicon-words" "$s/stems-ordered"
		done
	else
		delete 2 string-index word-index lexicon-ordered lexicon-words stems-ordered
	fi
	exit 0
fi

log "Building index from $CF_Indexer_Source in $DIR"
# Unless only stage 2 is requested (-2), start from an empty index directory.
if [ -z "$STAGE2ONLY" ] ; then
	log "Deleting old index"
	mkdir -p "$DIR"
	# Ask for confirmation only when the directory is non-empty, stty
	# succeeds (i.e. we appear to be on a terminal) and -f was not given.
	if ls "$DIR"/* >/dev/null 2>&1 && stty >/dev/null 2>&1 && [ "$FORCE" != 1 ] ; then
		printf '%s' "Delete old index? (y/N) "
		read -r answ
		if [ "$answ" != y ]; then exit; fi
	fi
	# ${DIR:?} aborts instead of expanding to "/*" should DIR ever be empty.
	rm -rf "${DIR:?}"/*
fi
for s in $SUBINDICES ; do mkdir -p "$DIR/$s" ; done

# Record the data source so that a later -R run can pick it up again.
# printf with a quoted argument stores the value verbatim (no word
# splitting, no globbing, no option parsing by echo).
log "Local data source is $CF_Indexer_Source"
printf '%s\n' "$CF_Indexer_Source" >"$DIR/source"

#ifndef CONFIG_BARE
# NOTE(review): the ifndef/endif marker lines around this section are plain
# comments to the shell; presumably a build-time preprocessor drops the
# section when CONFIG_BARE is defined -- confirm.
#
# Weights-only mode (-w): run a shortened pipeline and exit without
# building an index.
if [ -n "$WEIGHTSONLY" ] ; then
	log "Simplified indexer run for weight calculation"
	# Brace expansion yields four options: -SIndexer.LabelsByID=-,
	# -SIndexer.Checksums=-, -SIndexer.Signatures=- and -SIndexer.RefTexts=-.
	# NOTE(review): "-" presumably disables these outputs -- confirm.
	bin/scanner $G -SIndexer.{LabelsByID,Checksums,Signatures,RefTexts}=-
	# Create (or truncate to) an empty labels-by-id file.
	>"$DIR/labels-by-id"
	bin/fpsort $G
	bin/mkgraph $G
	bin/backlinker $G -1
	log "Weights calculated"
	exit 0
fi
#endif

# Stage 1 -- skipped entirely when -2 was given.
#
# NOTE(review): the ifdef/ifndef/else/endif marker lines below are plain
# comments to the shell; presumably a build-time preprocessor selects one
# variant before the script is installed -- confirm.  Run unprocessed, both
# the CONFIG_BARE and the regular command sequence would be executed.
#
# The keep calls snapshot the listed files under the given suffix when
# running with -d0; the delete calls remove intermediate files whose level
# is below the -d level.
if [ -z "$STAGE2ONLY" ] ; then
	# $INCREMENTAL is either empty or "-i DIR" taken from the command line.
	bin/scanner $G $INCREMENTAL
	keep scanner attributes merges
#ifndef CONFIG_BARE
	bin/fpsort $G
	bin/mkgraph $G
	delete 2 links-by-url
#endif
#ifndef CONFIG_BARE
	bin/backlinker $G -1
	keep backlinker1 attributes merges
#endif
#ifdef CONFIG_BARE
	bin/merger $G
	bin/labelsort $G
#else
	bin/keywords $G
	bin/backlinker $G -2
	delete 1 frame-graph image-graph
	keep backlinker2 attributes merges
	bin/mergefp $G
	keep mergefp merges
	bin/mergesums $G
	keep mergesums merges
	bin/mergesigns $G
	keep mergesigns merges
	delete 4 signatures
	bin/merger $G
	bin/reftexts $G
	delete 2 ref-texts
	bin/labelsort $G
	bin/ireport $G
#endif
	stats "after first stage"
	delete 2 url-list-translated
	delete 3 labels-by-id
	delete 4 merges fingerprints checksums link-graph link-graph-index keywords url-list notes-new
	# A second report is only logged when the delete level is above 2.
	[ "$DELETE" -le 2 ] || stats "after cleanup"
	# -1: stop here, after stage 1.
	[ -z "$STAGE1ONLY" ] || exit 0
fi
# Stage 2, part common to plain and subindexed setups: build and order
# the lexicon, run the chewer, then drop intermediate files according to
# the delete level.
bin/mklex $G
# lexfreq runs unless the delete level exceeds 4.
if ! [ "$DELETE" -gt 4 ] ; then
	bin/lexfreq $G
fi
bin/lexorder $G
delete 2 lexicon-raw
bin/chewer $G
# Only has an effect for "fd:" data sources.
disconnect
stats "after chewing"
delete 3 labels
delete 4 attributes notes
delete 7 large-classes matches weights lexicon-classes
delete 8 incremental

# Stage 2, final part: sort and seal either the single index in $DIR or
# every configured subindex in its own subdirectory.
if [ -z "$SUBINDICES" ] ; then
	# No subindices: everything lives directly in $DIR.
	bin/ssort $G
	delete 2 string-index
	bin/wsort $G
	delete 2 word-index lexicon-ordered
	bin/lexsort $G
	delete 2 lexicon-words stems-ordered
	if [ -n "$CF_Indexer_CardPrints" ] ; then
		bin/psort $G
	fi
	bin/seal $G
	sizes "$DIR"
else
	for s in $SUBINDICES ; do
		log "Processing subindex $s"
		# Every subindex gets symlinks to the shared ordered lexicon and stems.
		ln -sf ../lexicon-ordered ../stems-ordered "$DIR/$s/"
		# Same global options, with the directory redirected to the subindex.
		SG="$G -SIndexer.Directory=$DIR/$s"
		bin/ssort $SG
		delete 2 $s/string-index
		bin/wsort $SG
		delete 2 $s/word-index $s/lexicon-ordered
		bin/lexsort $SG --optimize
		delete 2 $s/lexicon-words $s/stems-ordered
		if [ -n "$CF_Indexer_CardPrints" ] ; then
			bin/psort $SG
		fi
		bin/seal $SG
		sizes "$DIR/$s"
	done
	# The shared originals are no longer needed once all subindices are done.
	delete 2 lexicon-ordered stems-ordered
fi

stats "after second stage"
log "Index built successfully."
