From c30814d5c3064d7b02a1457cda4a640ea66a278c Mon Sep 17 00:00:00 2001
From: David Vogel
Date: Mon, 4 Nov 2024 18:22:37 +0100
Subject: [PATCH] Initial commit

---
 README.md                                     | 18 ++++++
 go.mod                                        |  3 +
 init.go                                       | 15 +++++
 main.go                                       |  8 +++
 runner.go                                     | 58 +++++++++++++++++++
 service/linux/systemd/ocrmypdf-runner.service |  9 +++
 6 files changed, 111 insertions(+)
 create mode 100644 README.md
 create mode 100644 go.mod
 create mode 100644 init.go
 create mode 100644 main.go
 create mode 100644 runner.go
 create mode 100644 service/linux/systemd/ocrmypdf-runner.service

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..96cf1b2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,18 @@
+# OCRmyPDF runner
+
+A very simple tool that listens for files in a directory, and runs OCRmyPDF on them.
+
+This is needed as paperless(-ngx) will always create a copy of the document with its built-in clean-up and OCR feature.
+Even external pre-consumption scripts will be run on all new documents, not just on files from the consumption directory.
+So the solution is to have this watchdog/runner that only pre-processes scanned documents, and leaves everything else untouched.
+
+The idea is to let it watch a directory that any scanner will scan into, and then this runner will write the final pre-processed document into a directory that paperless watches.
+
+## Usage
+
+1. Install the project somewhere.
+2. Edit [init.go](init.go) to use the correct paths to your scanner and paperless consumption directories.
+3. Copy the [ocrmypdf-runner.service](service/linux/systemd/ocrmypdf-runner.service) into the systemd user services directory of your paperless user (`$HOME/.config/systemd/user/ocrmypdf-runner.service`).
+4. `systemctl --user daemon-reload`
+5. `systemctl --user enable ocrmypdf-runner.service`
+6. 
`systemctl --user start ocrmypdf-runner.service`
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..886c39a
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module ocrmypdf-runner
+
+go 1.23.0
diff --git a/init.go b/init.go
new file mode 100644
index 0000000..9105d74
--- /dev/null
+++ b/init.go
@@ -0,0 +1,17 @@
+package main
+
+import (
+	"path/filepath"
+	"time"
+)
+
+// init configures and starts the runner before main is executed.
+// Edit the paths below to match your scanner and paperless consumption directories.
+func init() {
+	runner := Runner{
+		InputPatterns: []string{filepath.Join(".", "input", "*.pdf")},
+		OutputPath:    filepath.Join("."),
+		Interval:      5 * time.Second,
+	}
+	runner.Run()
+}
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..7a72852
--- /dev/null
+++ b/main.go
@@ -0,0 +1,8 @@
+package main
+
+func main() {
+	// The runners are started by init() in init.go.
+
+	// Block forever, the runners do their work in background goroutines.
+	select {}
+}
diff --git a/runner.go b/runner.go
new file mode 100644
index 0000000..240d86e
--- /dev/null
+++ b/runner.go
@@ -0,0 +1,78 @@
+package main
+
+import (
+	"log"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"time"
+)
+
+// OCRMyPDFExecutable is the name of the OCRmyPDF binary that is run for every input file.
+const OCRMyPDFExecutable = "ocrmypdf"
+
+// Runner periodically searches for input files and runs OCRmyPDF on each of them.
+type Runner struct {
+	InputPatterns []string // The "Glob" patterns used for searching input files.
+	OutputPath    string   // The directory the processed files are written to.
+
+	Interval time.Duration // The time between two scans. Non-positive values default to 5 seconds.
+}
+
+// Run starts a background goroutine that scans the input patterns once per
+// interval, and returns immediately. The goroutine runs until the process exits.
+func (r Runner) Run() {
+	// time.NewTicker panics on non-positive durations, so fall back to the default.
+	if r.Interval <= 0 {
+		r.Interval = 5 * time.Second
+	}
+
+	go func() {
+		ticker := time.NewTicker(r.Interval)
+		defer ticker.Stop()
+
+		for range ticker.C {
+			r.scan()
+		}
+	}()
+}
+
+// scan processes every file that currently matches one of the input patterns.
+func (r Runner) scan() {
+	for _, inputPattern := range r.InputPatterns {
+		filenames, err := filepath.Glob(inputPattern)
+		if err != nil {
+			// Glob only fails on a malformed pattern, which is a configuration mistake.
+			log.Panicf("Failed to get input files: %v", err)
+		}
+
+		for _, filename := range filenames {
+			r.process(filename)
+		}
+	}
+}
+
+// process runs OCRmyPDF on a single input file, and removes the input file on success.
+// Failures are logged; the input file is then kept and retried on the next scan.
+func (r Runner) process(filename string) {
+	outputPath := filepath.Join(r.OutputPath, filepath.Base(filename))
+
+	// Every flag and every value has to be its own argument: exec.Command
+	// doesn't split arguments at spaces like a shell would.
+	cmd := exec.Command(OCRMyPDFExecutable,
+		"-l", "deu+eng",
+		"--rotate-pages",
+		"--optimize", "1",
+		filename, outputPath,
+	)
+
+	if output, err := cmd.CombinedOutput(); err != nil {
+		log.Printf("Failed to run OCRmyPDF on %q: %v\n%s", filename, err, output)
+		return
+	}
+
+	// Only delete the PDF if the previous steps did succeed.
+	if err := os.Remove(filename); err != nil {
+		log.Printf("Failed to remove input file %q: %v", filename, err)
+	}
+}
diff --git a/service/linux/systemd/ocrmypdf-runner.service b/service/linux/systemd/ocrmypdf-runner.service
new file mode 100644
index 0000000..b3dc327
--- /dev/null
+++ b/service/linux/systemd/ocrmypdf-runner.service
@@ -0,0 +1,9 @@
+[Unit]
+Description=A runner that will watch directories and runs OCRmyPDF on files in them.
+
+[Service]
+WorkingDirectory=/home/paperless/ocrmypdf-runner/
+ExecStart=go run .
+
+[Install]
+WantedBy=default.target