mirror of
https://github.com/tomru/docscan.git
synced 2026-03-03 06:27:21 +01:00
83 lines
1.5 KiB
Bash
Executable File
83 lines
1.5 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# OCR every PDF found directly in SRC_DIR and write the result to DST_DIR.
# Each source file is moved to ARCHIVE_DIR on success or to FAILED_DIR on
# failure; a mail listing the failed files is sent at the end of the run.

# Abort on unhandled command failures and on use of unset variables.
set -o errexit -o nounset

# Command line used to invoke ocrmypdf. It is expanded unquoted by the
# script so that it word-splits into the command and its arguments.
OCRMYPDF_CMD='docker run --rm -i ocrmypdf'

# Directory that is searched for PDFs and directory receiving OCR output.
SRC_DIR=/mnt/docscans
DST_DIR=/mnt/documents

# Lock file and working directories, all kept below the scan directory.
LOCKFILE=$SRC_DIR/.lock
ARCHIVE_DIR=$SRC_DIR/archive
FAILED_DIR=$SRC_DIR/failed
LOG_DIR=$SRC_DIR/logs

# Run statistics: number of PDFs processed successfully and the list of
# files that failed (empty when nothing failed).
PROCESSED_COUNT=0
FAILED_FILES=

#######################################
# Run ocrmypdf over a single PDF.
# Globals:
#   OCRMYPDF_CMD (read) - command line used to start ocrmypdf
#   DST_DIR      (read) - directory receiving the OCR'd PDF
#   LOG_DIR      (read) - directory receiving the per-file log
# Arguments:
#   $1 - path of the PDF to process
# Outputs:
#   DST_DIR/<name>.pdf with the command's stdout and LOG_DIR/<name>.log
#   with its stderr, where <name> is the source file name without its
#   last extension.
# Returns:
#   The exit status of the OCR command. On failure the partially written
#   PDF is removed so DST_DIR never holds a truncated document.
#######################################
function ocrFile() {
  local src=$1
  local file_name dst_pdf dst_log
  local status=0

  file_name=$(basename -- "$src")
  dst_pdf=$DST_DIR/${file_name%.*}.pdf
  dst_log=$LOG_DIR/${file_name%.*}.log

  # OCRMYPDF_CMD holds a command plus its arguments and therefore has to
  # word-split. The PDF is streamed through stdin/stdout ("- -").
  # shellcheck disable=SC2086
  $OCRMYPDF_CMD \
    -l deu \
    --clean \
    --rotate-pages \
    --deskew \
    - - \
    < "$src" \
    > "$dst_pdf" 2> "$dst_log" || status=$?

  if (( status != 0 )); then
    # The output redirection creates/truncates the target before the
    # command runs; do not leave that stub behind after a failure.
    rm -f -- "$dst_pdf"
  fi

  return "$status"
}

# Open the lock file on fd 100 and take an exclusive, non-blocking lock so
# that only one instance works on the scan directory at a time.
exec 100>"$LOCKFILE" || exit 23
flock -n 100 || { echo "$LOCKFILE" is locked; exit 1; }

mkdir -p "$DST_DIR" "$ARCHIVE_DIR" "$FAILED_DIR" "$LOG_DIR"

# find emits NUL-delimited names so paths with spaces or newlines survive.
# Process substitution (instead of a pipe) keeps the loop in the current
# shell, so PROCESSED_COUNT and FAILED_FILES are still set afterwards.
while IFS= read -r -d '' FILE; do
  echo "processing $FILE ..."

  if ocrFile "$FILE"; then
    echo processed "$FILE"
    mv -- "$FILE" "$ARCHIVE_DIR"
    echo moved "$FILE" to "$ARCHIVE_DIR"
    PROCESSED_COUNT=$(( PROCESSED_COUNT + 1 ))
  else
    echo failed to process "$FILE"
    mv -- "$FILE" "$FAILED_DIR"
    echo moved "$FILE" to "$FAILED_DIR"
    # One failed file per line; $'\n' is a real newline (an unquoted \n
    # would append a literal "n").
    FAILED_FILES+="${FILE}"$'\n'
  fi
  echo
done < <(find "$SRC_DIR" -maxdepth 1 -name '*.pdf' -print0)

echo Done
echo
echo processed "$PROCESSED_COUNT" PDFs

if [ -n "$FAILED_FILES" ]; then
  echo failed on PDFs
  echo "$FAILED_FILES"

  # sendmail -t takes the recipients from the message headers, so the
  # address is given as a To: header. HOSTNAME is set by bash itself.
  sendmail -t <<MAILTEXT
To: thomasruoff@gmail.com
Subject: [${HOSTNAME}] OCR on PDFs failed

Hey,
following PDFs failed to be OCR'd:

${FAILED_FILES}

Cheers

MAILTEXT
fi