Initial commit

This commit is contained in:
David Vogel 2024-11-04 18:22:37 +01:00
commit c30814d5c3
6 changed files with 111 additions and 0 deletions

18
README.md Normal file
View File

@ -0,0 +1,18 @@
# OCRmyPDF runner
A very simple tool that listens for files in a directory, and runs OCRmyPDF on them.
This is needed because paperless(-ngx) will always create a copy of the document with its built-in cleanup and OCR feature.
Even external pre-consumption scripts are run on all new documents, not just on files from the consumption directory.
So the solution is to have this watchdog/runner that only pre-processes scanned documents, and leaves everything else untouched.
The idea is to let it watch a directory any scanner will scan into, and then this runner will write the final pre-processed document into a directory paperless watches.
## Usage
1. Install the project somewhere.
2. Edit [init.go](init.go) to use the correct paths to your scanner and paperless consumption directories.
3. Copy the [ocrmypdf-runner.service](service/linux/systemd/ocrmypdf-runner.service) into the systemd user services directory of your paperless user (`$HOME/.config/systemd/user/ocrmypdf-runner.service`).
4. `systemctl --user daemon-reload`
5. `systemctl --user enable ocrmypdf-runner.service`
6. `systemctl --user start ocrmypdf-runner.service`

3
go.mod Normal file
View File

@ -0,0 +1,3 @@
module ocrmypdf-runner
go 1.23.0

15
init.go Normal file
View File

@ -0,0 +1,15 @@
package main
import (
"path/filepath"
"time"
)
// init assembles the default Runner and starts it before main is entered.
//
// The runner picks up every PDF found in the "input" directory below the
// working directory, hands the current directory to the runner as output
// location, and polls every five seconds.
func init() {
	watched := []string{filepath.Join(".", "input", "*.pdf")}
	target := filepath.Join(".")
	pollEvery := 5 * time.Second

	defaultRunner := Runner{
		Interval:      pollEvery,
		OutputPath:    target,
		InputPatterns: watched,
	}
	defaultRunner.Run()
}

8
main.go Normal file
View File

@ -0,0 +1,8 @@
package main
// main does no work of its own; it only keeps the process alive.
func main() {
// The runners will be started in some init.go file.
// An empty select has no case that can ever become ready, so it blocks
// this goroutine forever.
select {}
}

58
runner.go Normal file
View File

@ -0,0 +1,58 @@
package main
import (
"log"
"os"
"os/exec"
"path/filepath"
"time"
)
// OCRMyPDFExecutable is the name of the OCRmyPDF program that Runner.Run
// passes to exec.Command for every input file it finds.
const OCRMyPDFExecutable = "ocrmypdf"
// Runner describes one polling job: which files to look for, where the
// processed results go, and how often to look.
type Runner struct {
InputPatterns []string // The "Glob" patterns used for searching input files.
OutputPath string // The output directory; the result keeps the base name of its input file.
Interval time.Duration // Time between two polls. Run replaces a zero value with 5 seconds.
}
// Run starts a background goroutine that polls r.InputPatterns once per
// r.Interval and runs OCRmyPDF on every file that matches.
//
// For each match the result is written to r.OutputPath under the base name of
// the input file. The input file is removed only after OCRmyPDF exited
// successfully. Run itself returns immediately.
//
// A non-positive r.Interval is replaced by a default of 5 seconds.
//
// The goroutine panics (and thereby terminates the process) when a glob
// pattern is malformed or when OCRmyPDF fails, so that broken setups are
// noticed instead of being retried silently.
func (r Runner) Run() {
	// time.NewTicker panics on non-positive durations, so treat those like
	// the unset (zero) value.
	if r.Interval <= 0 {
		r.Interval = 5 * time.Second
	}

	go func() {
		ticker := time.NewTicker(r.Interval)
		defer ticker.Stop()

		for range ticker.C {
			for _, inputPattern := range r.InputPatterns {
				filenames, err := filepath.Glob(inputPattern)
				if err != nil {
					log.Panicf("Failed to get input files: %v", err)
				}

				for _, filename := range filenames {
					outputPath := filepath.Join(r.OutputPath, filepath.Base(filename))

					// exec.Command does not go through a shell, so nothing
					// splits arguments on spaces: every flag and every flag
					// value has to be its own element.
					args := []string{
						"-l", "deu+eng",
						"--rotate-pages",
						"--optimize", "1",
						filename, outputPath,
					}

					// Capture the tool's output so that a failure can be
					// diagnosed from the log.
					cmd := exec.Command(OCRMyPDFExecutable, args...)
					if output, err := cmd.CombinedOutput(); err != nil {
						log.Panicf("Failed to run OCRmyPDF on %q: %v\n%s", filename, err, output)
					}

					// Only delete the PDF if the previous steps did succeed.
					// A file that cannot be removed would be processed again
					// on the next tick, so make that visible.
					if err := os.Remove(filename); err != nil {
						log.Printf("Failed to remove processed input file %q: %v", filename, err)
					}
				}
			}
		}
	}()
}

View File

@ -0,0 +1,9 @@
[Unit]
Description=A runner that will watch directories and runs OCRmyPDF on files in them.
[Service]
WorkingDirectory=/home/paperless/ocrmypdf-runner/
ExecStart=go run .
[Install]
WantedBy=default.target