#!/bin/sh
# Sherlock Indexer Script
# (c) 2001--2004 Martin Mares <mj@ucw.cz>
# (c) 2003--2004 Robert Spalek <robert@ucw.cz>

# Print the command-line help to standard error and terminate the script
# with exit status 1.  Takes no arguments.
# The synopsis line names every option accepted by the getopts call
# (the flags -f and -w and the options -C and -S used to be missing).
usage() {
	cat >&2 <<'EOF'
Usage: indexer [-12acflvw] [-d NUM] [-C <arg>] [-S <arg>] [<source> [<dest-dir>]]

-1	Stop after stage 1
-2	Start with stage 2 (needs -d3 at the last time)
-a	Ignore filters and accept all documents
-c	Only clean files and exit (set -d)
-d NUM	Delete files of level smaller than NUM (default=4)
	9=index, 7=logs, 4=useful/short debug files, 3=labels,
	2=huge files, 1=really temporary files, 0=keep all versions
-f	Force deletion of the old index
-l	List index files between passes
-v	Be verbose (also enables progress indicators)
-w	Calculate weights and send gatherer feedback only
-C, -S	Global configuration options passed to all programs
EOF
	exit 1
}

# Command-line parsing.
# Start from a clean slate so that no flag leaks in from the environment,
# then translate the options into flag variables and into G, the list of
# global options handed to the programs run by this script.
unset VERBOSE G LISTS STAGE1ONLY STAGE2ONLY WEIGHTSONLY CLEAN FORCE
DELETE=4
set -e
while getopts "12acd:flvwC:S:" OPT ; do
	case "$OPT" in
		1)	STAGE1ONLY=1 ;;
		2)	STAGE2ONLY=2 ;;
		a)	G="${G} -SIndexer.Filter=" ;;
		c)	CLEAN=1 ;;
		d)	DELETE=$OPTARG ;;
		f)	FORCE=1 ;;
		l)	LISTS=1 ;;
		v)	VERBOSE=1
			G="${G} -SIndexer.ProgressScreen=1" ;;
		w)	WEIGHTSONLY=1 ;;
		C|S)	G="${G} -${OPT}${OPTARG}" ;;
		*)	usage ;;
	esac
done

# Collect the words consumed by getopts so that they can be passed on again,
# dropping them from the positional parameters along the way.
OPTION_ARGS=
while [ "$OPTIND" -gt 1 ] ; do
	OPTION_ARGS="${OPTION_ARGS} $1"
	OPTIND=$((OPTIND - 1))
	shift
done

# What remains is [<source> [<dest-dir>]]; a third word is an error.
[ -z "$3" ] || usage
case "$2" in
	"")	;;
	*)	G="${G} -SIndexer.Directory=$2"
		OPTION_ARGS="${OPTION_ARGS} -SIndexer.Directory=$2" ;;
esac
[ -z "$1" ] || G="${G} -SIndexer.Source=$1"

# Hand one message to bin/logger, tagged with the name "indexer" and the
# letter I.
#   $1 - message text (any further arguments are not used)
log() {
	bin/logger indexer I "$1"
}

# Remove index files of a given level unless the user asked to keep them.
#   $1    - level of the files; they are removed only when this is
#           numerically smaller than $DELETE
#   $2... - file names or glob patterns, relative to $CF_Directory;
#           processing stops at the first empty argument
# Globals: DELETE (read), CF_Directory (read)
delete() {
	[ "$1" -lt "$DELETE" ] || return 0
	shift
	local pattern
	for pattern in "$@" ; do
		[ -n "$pattern" ] || break
		# The directory is quoted so that paths containing whitespace work;
		# the pattern stays unquoted on purpose so that globs such as
		# attributes.* are expanded inside the index directory.
		rm -f "$CF_Directory"/$pattern
	done
}

# Preserve intermediate versions of index files.  Active only when
# $DELETE is 0 or less (the help text describes level 0 as "keep all
# versions"); otherwise nothing is done.
#   $1    - suffix appended (after a dot) to the name of each copy
#   $2... - file names relative to $CF_Directory; processing stops at the
#           first empty argument
# Globals: DELETE (read), CF_Directory (read)
keep() {
	[ "$DELETE" -le 0 ] || return 0
	local suffix=$1 name
	shift
	for name in "$@" ; do
		[ -n "$name" ] || break
		# Both operands are quoted so that paths with whitespace are copied
		# correctly.
		cp "$CF_Directory/$name" "$CF_Directory/$name.$suffix"
	done
}

# Log the disk usage of the index directory as reported by du -s and,
# when $LISTS is non-empty, send a long listing of the directory to
# bin/logger as well.
#   $1 - text inserted after "Disk usage" in the log message
# Globals: CF_Directory (read), LISTS (read)
stats() {
	# The directory is quoted so that paths containing whitespace are
	# measured as a single operand.
	log "Disk usage $1: $(du -s "$CF_Directory" | cut -f 1) blocks"
	[ -z "$LISTS" ] || ( ls -Al "$CF_Directory" | bin/logger indexer D )
}

# Run bin/sizer on the card-attrs, cards, lexicon, references and
# string-map files of the index directory, then send the total size of
# the directory (du -bs) to bin/logger.
# Globals: CF_Directory (read)
sizes() {
	# The file list is spelled out and quoted: this works for directories
	# containing whitespace and does not depend on brace expansion.
	bin/sizer \
		"$CF_Directory/card-attrs" \
		"$CF_Directory/cards" \
		"$CF_Directory/lexicon" \
		"$CF_Directory/references" \
		"$CF_Directory/string-map"
	local total_index
	total_index=$(du -bs "$CF_Directory" | cut -f 1)
	echo "total index size is $total_index" | bin/logger sizer I
}

# Run "bin/iconnect --disconnect" with the global options when the
# configured source starts with "fd:"; do nothing for any other source.
# Globals: CF_Source (read), G (read)
disconnect() {
	# A case pattern replaces the unquoted substring test, which made the
	# test command fail with a syntax error when CF_Source was empty.
	case "$CF_Source" in
		fd:*)
			# G is a whitespace-separated option list and is split on purpose.
			bin/iconnect --disconnect $G
			;;
	esac
}

# Evaluate the output of bin/config.  The rest of the script reads the
# CF_Directory, CF_Source and CF_CardPrints variables afterwards, so the
# output is presumably a list of CF_* shell assignments.
eval $(bin/config $G Indexer Directory=not/configured Source=not/configured LexByFreq CardPrints '*')

# A source starting with "remote:" is handed over to bin/iconnect together
# with the original option arguments; exec replaces this script.
# A case pattern is used because the unquoted substring test broke on an
# empty source.  OPTION_ARGS starts with a space and is split on purpose.
case "$CF_Source" in
	remote:*)
		exec bin/iconnect "$CF_Source" bin/indexer$OPTION_ARGS
		;;
esac

# Deletion levels above 2 also turn on the Indexer.SortDeleteSrc option.
if [ "$DELETE" -gt 2 ]; then
	G="$G -SIndexer.SortDeleteSrc=1"
fi

# Remove leftovers of previous runs.  The glob patterns are quoted so that
# they reach delete() unexpanded and are matched inside the index directory
# only; unquoted, they could expand against files in the current directory.
delete 0 'attributes.*' 'merges.*'
delete 4 lexicon-by-freq

# With -c only the cleanup is performed: every file whose level is below
# the -d threshold is removed and the script exits.
if [ -n "$CLEAN" ]; then
	delete 1 frame-graph image-graph
	delete 2 links-by-url ref-texts url-list-translated
	delete 2 lexicon-raw string-index word-index lexicon-ordered lexicon-words stems-ordered
	delete 3 labels-by-id labels
	delete 4 merges fingerprints checksums signatures link-graph link-graph-index keywords url-list
	delete 4 attributes notes notes-new
	delete 7 large-classes matches sites weights lexicon-classes
	exit 0
fi

log "Building index from $CF_Source in $CF_Directory"
# Unless only the second stage was requested, start from an empty index
# directory.
if [ -z "$STAGE2ONLY" ] ; then
	log "Deleting old index"
	mkdir -p "$CF_Directory"
	# Ask for confirmation only when the directory is not empty, stty
	# succeeds (presumably meaning that a terminal is attached) and -f was
	# not given.
	if ls "$CF_Directory"/* >/dev/null 2>&1 \
	   && stty >/dev/null 2>&1 \
	   && [ "$FORCE" != 1 ]
	then
		printf '%s' "Delete old index? (y/N) "
		read -r answ
		if [ "$answ" != y ]; then exit; fi
	fi
	# The :? guard aborts instead of expanding to /* should the directory
	# name ever be empty.
	rm -f "${CF_Directory:?}"/*
fi

#ifndef CONFIG_BARE
# Shortened run selected by -w: stops after the first backlinker pass.
if [ -n "$WEIGHTSONLY" ] ; then
	log "Simplified indexer run for weight calculation"
	# The four options are written out one by one so that they do not
	# depend on brace expansion being available in the shell.
	bin/scanner $G \
		-SIndexer.LabelsByID=- \
		-SIndexer.Checksums=- \
		-SIndexer.Signatures=- \
		-SIndexer.RefTexts=-
	# Create an empty labels-by-id file; the target is quoted so that a
	# directory name with whitespace is not an ambiguous redirect.
	: >"$CF_Directory/labels-by-id"
	bin/fpsort $G
	bin/mkgraph $G
	bin/backlinker $G -1
	log "Weights calculated"
	exit 0
fi
#endif

# First stage, skipped when -2 was given.  After most steps keep() saves a
# copy of the intermediate files (only with -d0) and delete() removes the
# files whose level is below the -d threshold.
if [ -z "$STAGE2ONLY" ] ; then
	bin/scanner $G
	keep scanner attributes merges
#ifndef CONFIG_BARE
	bin/fpsort $G
	bin/mkgraph $G
#endif
#ifndef CONFIG_BARE
	bin/backlinker $G -1
	keep backlinker1 attributes merges
#endif
#ifdef CONFIG_BARE
	# Create an empty labels file; the target is quoted so that a directory
	# name with whitespace is not an ambiguous redirect.
	: >"$CF_Directory/labels"
#else
	bin/keywords $G
	bin/backlinker $G -2
	delete 1 frame-graph image-graph
	keep backlinker2 attributes merges
	bin/mergefp $G
	keep mergefp merges
	bin/mergesums $G
	keep mergesums merges
	bin/mergesigns $G
	keep mergesigns merges
	bin/merger $G
	bin/reftexts $G
	bin/labelsort $G
	bin/ireport $G
#endif
	stats "after first stage"
	delete 2 links-by-url ref-texts url-list-translated
	delete 3 labels-by-id
	delete 4 merges fingerprints checksums signatures link-graph link-graph-index keywords url-list notes-new
	# The second report is produced only when the threshold exceeds 2.
	[ "$DELETE" -le 2 ] || stats "after cleanup"
	# With -1 the run ends here.
	[ -z "$STAGE1ONLY" ] || exit 0
fi
# Second stage.  The programs run in a fixed order; the delete() calls in
# between drop files whose level is below the -d threshold.
bin/mklex $G
# bin/lexfreq is skipped only when the threshold is above 4.
if ! [ "$DELETE" -gt 4 ] ; then
	bin/lexfreq $G
fi
bin/lexorder $G
delete 2 lexicon-raw

bin/chewer $G
disconnect
stats "after chewing"
delete 3 labels
delete 4 attributes notes

bin/ssort $G
delete 2 string-index

bin/wsort $G
delete 2 word-index lexicon-ordered

bin/lexsort $G
delete 2 lexicon-words stems-ordered
delete 7 large-classes matches sites weights lexicon-classes

# bin/psort runs only when the CardPrints configuration value is non-empty.
if [ -n "$CF_CardPrints" ] ; then
	bin/psort $G
fi

stats "after second stage"
sizes
bin/seal $G
log "Index built successfully."
