#!/bin/sh
# Distributed under the terms of the GNU General Public License v3
# AUTHOR: Magic Banana
# e-mail: lcerf@dcc.ufmg.br

# The output file is named after the last matching PDF file: the basename of
# that file (without ".pdf") followed by $SUFFIX.pdf
SUFFIX='-matches'

# If called with no argument, show usage help and exit
if [ -z "$1" ]
then
    # ${0##*/} is the name of this script without its directory.  Unlike an
    # unquoted call to basename, it still works if the path contains blanks.
    cat <<EOF
Usage: ${0##*/} [OPTION]... FILE.pdf [OTHER_FILE.pdf]...
Concatenate the pages (of the PDF files) that contain patterns.
The patterns are read from the standard input and end with an empty line.
They are, by default, basic regular expression (BRE).

OPTION can be:
  -E, --extended-regexp     patterns are extended regular expressions (ERE)
  -F, --fixed-strings       patterns are fixed strings
  -P, --perl-regexp         patterns are Perl regular expressions
  -i, --ignore-case         ignore case distinctions
  -w, --word-regexp         force patterns to match only whole words
  -x, --line-regexp         force patterns to match only whole lines
EOF
    exit
fi

# Create, in the temporary directory (/tmp unless TMPDIR is set), the file that
# stores the patterns.  Its name is also the prefix of the other temporary
# files, which are named $TMP-1, $TMP-2, etc.
TMP=$(mktemp "${TMPDIR:-/tmp}/pdf-page-grep.XXXXXX") || exit 2
# Never go on with an empty prefix: the cleanup below would otherwise target
# files that do not belong to this script
[ -n "$TMP" ] || exit 2
# Those temporary files are removed when the script exits ...
trap 'rm -f -- "$TMP" "$TMP"-* 2>/dev/null' 0
# ... including when it is interrupted: exiting from a signal trap triggers
# the exit trap above, whatever the shell running the script
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM

# Ignore a variable "pattern" that would be inherited from the environment
pattern=''
# Read a mandatory pattern
while [ -z "$pattern" ]
do
    printf 'pattern: '
    # -r keeps the backslashes, which are meaningful in regular expressions
    if ! read -r pattern && [ -z "$pattern" ]
    then
        # End of input without any pattern: give up instead of looping forever
        echo
        echo "${0##*/}: no pattern given" >&2
        exit 2
    fi
done
while [ -n "$pattern" ]
do
    # Store the pattern, verbatim, in the temporary file
    printf '%s\n' "$pattern" >> "$TMP"
    # Read optional additional patterns; at the end of the input, read leaves
    # the variable empty, what stops the loop
    printf 'OR pattern (empty to stop): '
    read -r pattern
done

# Print every argument that is an option, i.e., that starts with "-".  Each
# option is preceded by a space, so that the output can be appended to a
# command name.  Nothing is printed if no argument is an option.
collect_options() {
    for arg in "$@"
    do
        # A pattern match, unlike "expr substr" (a GNU extension whose
        # unquoted output is split and globbed), copes with any argument:
        # empty, starting with a blank, starting with "*", etc.
        case "$arg" in
            -*) printf ' %s' "$arg" ;;
        esac
    done
}

# Concatenate, in the variable "options", the options (the assignment discards
# any value "options" may have inherited from the environment)
options=$(collect_options "$@")

# Set the variable "sel" to the comma-separated list of the pages of the PDF
# file $1 whose text matches at least one of the patterns stored in the file
# $TMP (grep being called with $options).  Every matching page number is
# printed, preceded by a space, as soon as it is found.
# Return 1, with "sel" empty, if pdfinfo does not report a number of pages.
select_pages() {
    sel=''
    # Get the number of pages
    nbOfPages=$(pdfinfo "$1" | awk '$1 == "Pages:" { print $2 }')
    case "$nbOfPages" in
        '' | *[!0-9]*)
            # pdfinfo failed (its own error message was already printed) or
            # its output is not understood: there is no page to enumerate
            printf '\n%s: cannot get the number of pages of "%s"\n' \
                   "${0##*/}" "$1" >&2
            return 1
            ;;
    esac
    # Enumerate the pages one by one
    page=1
    while [ "$page" -le "$nbOfPages" ]
    do
        # Search for the presence of at least one pattern in the page
        # ($options is deliberately unquoted: it holds zero or more options,
        # each preceded by a space)
        if pdftotext -f "$page" -l "$page" "$1" - | grep$options -qf "$TMP"
        then
            # The page matches: print its number
            printf ' %s' "$page"
            # Remember the page (a comma separates it from the previous one)
            sel=${sel:+$sel,}$page
        fi
        page=$((page + 1))
    done
}

# Number of matching PDF files: empty as long as no file matched
nb=''
# Process the PDF files
for arg in "$@"
do
    # Every non-option argument is a PDF file to process
    case "$arg" in
        -*) continue ;;
    esac
    printf '\nmatching pages in "%s":' "$arg"
    select_pages "$arg"
    if [ -n "$sel" ]
    then
        # The PDF file matches: increment the number of matching files
        nb=$((${nb:-0} + 1))
        # Extract the matching pages into a numbered temporary file
        pdfjam -q -o "$TMP-$nb" "$arg" "$sel"
        # Remember the basename of this PDF file (last match until now)
        out=$(basename "$arg" .pdf)
    fi
done
# End the line listing the matching pages and leave an empty line
printf '\n\n'
if [ -z "$nb" ]
then
    # No PDF file matched: exit returning 1
    echo 'No match!'
    exit 1
fi
if [ "$nb" -eq 1 ]
then
    # One single file matched: move the matching pages to the output file
    echo '1 matching PDF file'
    # mv prints its own error message; do not claim success if it fails
    mv "$TMP-1" "$out$SUFFIX.pdf" || exit 2
else
    # Concatenate all matching pages in the order they were found
    echo "$nb matching PDF files"
    # The command-line arguments are not needed anymore: the positional
    # parameters are reused to list the temporary files $TMP-1 ... $TMP-$nb
    # without relying on word splitting
    set --
    i=1
    while [ "$i" -le "$nb" ]
    do
        set -- "$@" "$TMP-$i"
        i=$((i + 1))
    done
    # As above, a failure must not be reported as a success
    pdfunite "$@" "$out$SUFFIX.pdf" || exit 2
fi
# Inform the output file name
printf 'Output written to "%s"\n' "$out$SUFFIX.pdf"