From c30814d5c3064d7b02a1457cda4a640ea66a278c Mon Sep 17 00:00:00 2001
From: David Vogel <Dadido3@aol.com>
Date: Mon, 4 Nov 2024 18:22:37 +0100
Subject: [PATCH] Initial commit

---
 README.md                                     | 18 ++++++
 go.mod                                        |  3 +
 init.go                                       | 15 +++++
 main.go                                       |  8 +++
 runner.go                                     | 58 +++++++++++++++++++
 service/linux/systemd/ocrmypdf-runner.service |  9 +++
 6 files changed, 111 insertions(+)
 create mode 100644 README.md
 create mode 100644 go.mod
 create mode 100644 init.go
 create mode 100644 main.go
 create mode 100644 runner.go
 create mode 100644 service/linux/systemd/ocrmypdf-runner.service

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..96cf1b2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,18 @@
+# OCRmyPDF runner
+
+A very simple tool that watches a directory for PDF files and runs OCRmyPDF on them.
+
+This is needed as paperless(-ngx) will always create a copy of the document with its built-in clean-up and OCR feature.
+Even external pre-consumption scripts are run on all new documents, not just on files from the consumption directory.
+So the solution is to have this watchdog/runner that only pre-processes scanned documents, and leaves everything else untouched.
+
+The idea is to let it watch a directory any scanner will scan into, and then this runner will write the final pre-processed document into a directory paperless watches.
+
+## Usage
+
+1. Install the project somewhere.
+2. Edit [init.go](init.go) to use the correct paths to your scanner and paperless consumption directories.
+3. Copy [ocrmypdf-runner.service](service/linux/systemd/ocrmypdf-runner.service) into the systemd user unit directory of your paperless user (`$HOME/.config/systemd/user/ocrmypdf-runner.service`).
+4. `systemctl --user daemon-reload`
+5. `systemctl --user enable ocrmypdf-runner.service`
+6. `systemctl --user start ocrmypdf-runner.service`
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..886c39a
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module ocrmypdf-runner
+
+go 1.23.0
diff --git a/init.go b/init.go
new file mode 100644
index 0000000..9105d74
--- /dev/null
+++ b/init.go
@@ -0,0 +1,15 @@
+package main
+
+import (
+	"path/filepath"
+	"time"
+)
+
+// init builds the runner configuration and starts it right away.
+// It polls ./input for PDF files every five seconds and writes the results to the working directory.
+func init() {
+	patterns := []string{filepath.Join(".", "input", "*.pdf")}
+	outDir := filepath.Join(".")
+	r := Runner{InputPatterns: patterns, OutputPath: outDir, Interval: 5 * time.Second}
+	r.Run()
+}
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..7a72852
--- /dev/null
+++ b/main.go
@@ -0,0 +1,8 @@
+package main
+
+// main only keeps the process alive; the runners are started from init (see init.go).
+func main() {
+	// Runner.Run does its work in a background goroutine, so block here forever
+	// instead of returning and ending the program.
+	select {}
+}
diff --git a/runner.go b/runner.go
new file mode 100644
index 0000000..240d86e
--- /dev/null
+++ b/runner.go
@@ -0,0 +1,58 @@
+package main
+
+import (
+	"log"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"time"
+)
+
+const OCRMyPDFExecutable = "ocrmypdf" // Command name that Run passes to exec.Command.
+
+type Runner struct {
+	InputPatterns []string // The "Glob" patterns used for searching input files.
+	OutputPath    string   // The output directory; each result is stored there under the input's base name.
+	// Interval is the time between two scans of InputPatterns. Run uses 5 seconds if it is zero.
+	Interval time.Duration
+}
+
+// Run starts a background goroutine that, once per Interval (5 seconds if unset), globs
+// every input pattern and runs OCRmyPDF on each match, writing the result into OutputPath.
+// An input file is deleted only after OCRmyPDF succeeded for it. Run returns immediately.
+func (r Runner) Run() {
+	if r.Interval == 0 {
+		r.Interval = 5 * time.Second
+	}
+
+	go func() {
+		ticker := time.NewTicker(r.Interval)
+		defer ticker.Stop()
+		for range ticker.C {
+			for _, inputPattern := range r.InputPatterns {
+				filenames, err := filepath.Glob(inputPattern)
+				if err != nil {
+					log.Panicf("Failed to get input files: %v", err)
+				}
+
+				for _, filename := range filenames {
+					outputPath := filepath.Join(r.OutputPath, filepath.Base(filename))
+
+					// exec.Command does no shell-style word splitting, so every flag
+					// and every flag value has to be its own argv element.
+					args := []string{"-l", "deu+eng", "--rotate-pages", "--optimize", "1", filename, outputPath}
+					if out, err := exec.Command(OCRMyPDFExecutable, args...).CombinedOutput(); err != nil {
+						// Keep the input and the runner alive; the file is retried on the next tick.
+						log.Printf("Failed to run OCRmyPDF on %q: %v\n%s", filename, err, out)
+						continue
+					}
+
+					// Only delete the PDF if the previous steps did succeed.
+					if err := os.Remove(filename); err != nil {
+						log.Printf("Failed to remove %q: %v", filename, err)
+					}
+				}
+			}
+		}
+	}()
+}
diff --git a/service/linux/systemd/ocrmypdf-runner.service b/service/linux/systemd/ocrmypdf-runner.service
new file mode 100644
index 0000000..b3dc327
--- /dev/null
+++ b/service/linux/systemd/ocrmypdf-runner.service
@@ -0,0 +1,9 @@
+[Unit]
+Description=A runner that watches directories and runs OCRmyPDF on files in them.
+
+[Service]
+WorkingDirectory=/home/paperless/ocrmypdf-runner/
+ExecStart=go run .
+
+[Install]
+WantedBy=default.target