A low-cost system for a PDF literature archive I

Getting a scientific paper onto your hard disk is quite simple. I am using a Fujitsu ScanSnap that can process a single page in a few seconds. The resulting PDF needs to be further processed by OCR software like ABBYY FineReader (I couldn’t find any good open source alternative). FR will leave your PDF intact while adding the recognized text as an overlay (or “underlay”). Unfortunately FR does not support batch processing, but your OS can do this with the help of a Windows scripting engine like CLRscript. We also need a tool to extract a text file from the modified PDF. A good choice is pdftotext — look at the source code and the DRM discussion before compiling it in an environment like Cygwin. The following Perl script does nothing more than traverse your target directory and create a batch file. As the file names offered by publishers are rather strange, I would first create some clean file names by replacing all spaces and brackets with something innocent like underscores.
perl.exe ocr.pl rename h:\pdf\2008\*.*
Now we create text files from the PDFs (usually done better by XPDF than directly by GDS).
perl.exe ocr.pl extract h:\pdf\2008\*.pdf
The resulting text files should be inspected: a very small file size usually indicates that the extraction failed. Such files should be deleted before starting the OCR step, because OCR is only run for PDFs whose text file is missing.
perl.exe ocr.pl ocr h:\pdf\2008\*.pdf
In the last step you may want to repeat the extract step.

ocr.zip

 1:
 2:
 3:
 4:
 5:
 6:
 7:
 8:
 9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
69:
70:
71:
72:
73:
74:
75:
76:
77:
78:
79:
80:
81:
82:
83:
84:
85:
86:
87:
89:
# ocr.pl setting up batch ocr commands
# m@wjst.de 26Oct03
# ------------------------------------------------------------
#!/usr/local/bin/perl -w
use strict;
use warnings;

# Usage: perl ocr.pl <rename|extract|ocr> <file-pattern>
#
# Expands <file-pattern> with glob and writes a Windows batch file
# "_ocr.cmd" into the current directory.  Depending on the action the
# batch file
#   rename  - renames .pdf/.txt files so that spaces, brackets, '%', '#'
#             and extra dots in the name become underscores;
#   extract - runs pdftotext.exe on every PDF that has no .txt sibling;
#   ocr     - calls the ":sub" batch routine (emitted by CLRScrpt) and
#             CLRScrpt.exe for every PDF that has no .txt sibling.
# The batch file deletes itself when it has finished.

my ($action, $pattern) = @ARGV;
my %is_known_action = map { $_ => 1 } qw(rename extract ocr);
die "usage: perl ocr.pl <rename|extract|ocr> <file-pattern>\n"
    unless defined $action
        && defined $pattern
        && $is_known_action{$action};

# perl.exe on Windows receives the wildcard unexpanded, so expand it here.
my @files = glob($pattern);

# The bareword handle OUT is kept on purpose: CLRScrpt() prints to it.
open(OUT, '>', '_ocr.cmd') or die "cannot write _ocr.cmd: $!";
print OUT "\@echo off\n";

foreach my $file (@files) {
    if ($action eq 'rename') {
        # Only the file name is cleaned; the directory part must stay as it
        # is, and the "rename" command accepts no path in its target.
        (my $name = $file) =~ s{^.*[\\/]}{}s;

        # Stem = everything before the last .pdf/.txt, suffix = the rest.
        # Files without such an extension are skipped instead of reusing
        # captures from an earlier iteration.
        next unless $name =~ m/^(.*)(\.(?:pdf|txt).*)$/i;
        my ($stem, $suffix) = ($1, $2);

        $stem =~ s/[()\s%.#]/_/g;
        my $new = $stem . $suffix;
        if ($new ne $name) {
            print OUT "rename \"$file\" \"$new\" \n";
        }
    }
    else {
        # extract / ocr: only PDFs that do not yet have a text file.
        next unless $file =~ m/(.*)(\.pdf)/i;
        my $stem = $1;
        next if -e "${stem}.txt";

        if ($action eq 'extract') {
            print OUT "start /min /wait pdftotext.exe -layout -eol dos \"${stem}.pdf\"\n";
        }
        elsif ($action eq 'ocr') {
            # NOTE(review): the file name is passed unquoted, so this relies
            # on the rename step having removed spaces beforehand.
            print OUT "call:sub ${stem}.pdf & start /max /wait CLRScrpt.exe /r _ocr.tmp\n";
        }
    }
}

# Clean-up commands must come before the ":sub" routine.  Batch execution
# falls through labels, so anything emitted after the routine's "goto:eof"
# would never run and the main flow would execute ":sub" once more.
if ($action eq 'ocr') {
    print OUT "del _ocr.tmp>nul\n";
}
print OUT "del _ocr.cmd>nul\n";
if ($action eq 'ocr') {
    CLRScrpt();
}

close(OUT) or die "cannot close _ocr.cmd: $!";
exit(0);

# CLRScrpt - append the ":sub" batch subroutine to the open OUT handle.
#
# The emitted batch routine writes a small CLRScript program to _ocr.tmp
# (one "echo ... >>%o%" per line).  %1 is the argument given to
# "call:sub <file>" and is inserted into both SetDlgItemText calls.  The
# generated program starts FineReader and drives it through key strokes
# and button clicks; the button captions are the ones of the author's
# (partly German) installation and are emitted verbatim.
#
# Takes no arguments; returns the result of print.
sub CLRScrpt {
    # A literal heredoc is required here: with an interpolating one the
    # backslashes of the FineReader path ("\P", "\A", "\F") are consumed as
    # string escapes and the emitted path loses its separators.
    # NOTE(review): confirm whether CLRScript itself wants "\\" inside
    # string literals; this emits the path exactly as written below.
    return print OUT <<'EOF';
:: --------------------------------------------------------------
:sub
set o=_ocr.tmp
echo void main() { >%o%
echo if (Run("c:\Programme\ABBYY FineReader 7.0 Professional Edition\FineReader.exe",SW_SHOWMAXIMIZED)); >>%o%
echo Pause (1000); >>%o%
echo ClickButton("NO"); >>%o%
echo Pause (1000); >>%o%
echo SendKeys("{ctrl}o"); >>%o%
echo Pause (1000); >>%o%
echo SetDlgItemText(1152,"%1"); >>%o%
echo ClickButton("Ö&ffnen"); >>%o%
echo Pause (1000); >>%o%
echo while (VerifyActiveWindowTitleSub("Adding",1000)) >>%o%
echo { Pause (1000); } >>%o%
echo SendKeys("{ctrl}{shift}r"); >>%o%
echo Pause (1000); >>%o%
echo while (VerifyActiveWindowTitleSub("Reading",1000)) >>%o%
echo { Pause (1000); } >>%o%
echo SendKeys("{ctrl}{F2}"); >>%o%
echo Pause (1000); >>%o%
echo SetDlgItemText(1152,"%1"); >>%o%
echo Pause (1000); >>%o%
echo ClickButton("For&mats Settings..."); >>%o%
echo Pause (1000); >>%o%
echo ClickButton("Text &under the page image"); >>%o%
echo Pause (1000); >>%o%
echo ClickButton("OK"); >>%o%
echo Pause (1000); >>%o%
echo ClickButton("&Speichern"); >>%o%
echo Pause (1000); >>%o%
echo ClickButton("OK"); >>%o%
echo while (VerifyActiveWindowTitleSub("Saving",1000)) >>%o%
echo { Pause (1000); } >>%o%
echo SendKeys("{alt}fx"); >>%o%
echo Pause (1000); >>%o%
echo ClickButton("&Nein"); >>%o%
echo exit(); >>%o%
echo } >>%o%
goto:eof
EOF
}