Scanyonero/document/ingestor.go
David Vogel 853a1bb58d Rework into FTP scanning server
- Rename to Scanyonero
- Add FTP server that ingests TIFF, PNG, JPEG or PDF files
- Add web interface to check and modify ingested files
- Rework how ocrmypdf is invoked

Basics are working, but the program is not in a usable state.
2025-05-14 12:08:38 +02:00

139 lines
3.9 KiB
Go

package document
import (
"Scanyonero/unit"
"bytes"
"fmt"
"image"
"path/filepath"
"strings"
"github.com/chai2010/tiff"
"github.com/pdfcpu/pdfcpu/pkg/api"
"github.com/pdfcpu/pdfcpu/pkg/pdfcpu"
"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
"image/jpeg"
"image/png"
)
// Ingestor contains all settings and rules for image/document file ingestion.
//
// An Ingestor turns a raw file into a list of pages via its Ingest method.
type Ingestor struct {
// DefaultDPI is the pixel density that is assumed when a file doesn't provide any (usable) density information.
// It is used to derive the physical page dimensions from the pixel dimensions of an image.
DefaultDPI unit.PerInch // Default/fallback dots per inch value.
// Rules are applied to every ingested page, in the order they appear in this slice.
// The first rule that returns an error aborts the ingestion.
Rules []IngestorRule
}
// Ingest decodes the given file into one or more pages and applies all ingestor rules to every page.
//
// The decoder is selected by the (case-insensitive) extension of file.Name:
//   - ".jpg", ".jpeg": Results in a single page. The physical dimensions are derived from the JFIF density
//     metadata if it can be read, otherwise from ingestor.DefaultDPI.
//   - ".png": Results in a single page. The physical dimensions are derived from ingestor.DefaultDPI.
//   - ".tif", ".tiff": Results in one page per image contained in the file. The physical dimensions are derived
//     from ingestor.DefaultDPI.
//   - ".pdf": Results in one page per PDF page. Every PDF page must contain exactly one image.
//     The physical dimensions are taken from the media box of the PDF page.
//
// After decoding, every rule of ingestor.Rules is applied to every page, in order.
//
// An error is returned if the extension is not supported, if the file or any of its images can't be decoded,
// or if any rule fails. In case of an error, the returned page list is nil.
func (ingestor Ingestor) Ingest(file File) ([]Page, error) {
	ext := filepath.Ext(file.Name)

	var pages []Page

	switch strings.ToLower(ext) {
	case ".jpg", ".jpeg":
		img, err := jpeg.Decode(bytes.NewReader(file.Data))
		if err != nil {
			return nil, fmt.Errorf("failed to decode JPEG file: %w", err)
		}

		// Start with the fallback density, and override it if the file contains readable JFIF metadata.
		dimensions := unit.NewPageDimensionsFromDensity(img.Bounds().Dx(), img.Bounds().Dy(), ingestor.DefaultDPI, ingestor.DefaultDPI)
		if tag, err := decodeJFIF(bytes.NewReader(file.Data)); err == nil {
			// Get more exact density info from the file metadata.
			// NOTE(review): This assumes that Density() always returns usable (non-zero) values when decodeJFIF succeeds. Confirm against decodeJFIF.
			xDensity, yDensity := tag.Density()
			dimensions = unit.NewPageDimensionsFromDensity(img.Bounds().Dx(), img.Bounds().Dy(), xDensity, yDensity)
		}

		pages = append(pages, Page{
			Image:      img,
			Dimensions: dimensions,
		})

	case ".png":
		img, err := png.Decode(bytes.NewReader(file.Data))
		if err != nil {
			return nil, fmt.Errorf("failed to decode PNG file: %w", err)
		}

		pages = append(pages, Page{
			Image:      img,
			Dimensions: unit.NewPageDimensionsFromDensity(img.Bounds().Dx(), img.Bounds().Dy(), ingestor.DefaultDPI, ingestor.DefaultDPI),
		})
		// TODO: Read pixel density metadata from PNG file

	case ".tif", ".tiff":
		mm, errs, err := tiff.DecodeAll(bytes.NewReader(file.Data))
		if err != nil {
			return nil, fmt.Errorf("failed to decode TIFF file: %w", err)
		}

		for i, m := range mm {
			for j, img := range m {
				// DecodeAll reports decoding failures of individual images separately from err.
				// A failed image must not end up as page, as calling Bounds() on a nil image would panic.
				if i < len(errs) && j < len(errs[i]) && errs[i][j] != nil {
					return nil, fmt.Errorf("failed to decode TIFF image %d/%d: %w", i, j, errs[i][j])
				}
				if img == nil {
					return nil, fmt.Errorf("failed to decode TIFF image %d/%d: no image data", i, j)
				}

				pages = append(pages, Page{
					Image:      img,
					Dimensions: unit.NewPageDimensionsFromDensity(img.Bounds().Dx(), img.Bounds().Dy(), ingestor.DefaultDPI, ingestor.DefaultDPI),
				})
				// TODO: Read pixel density metadata from TIFF file
			}
		}

	case ".pdf":
		conf := model.NewDefaultConfiguration()
		conf.Cmd = model.EXTRACTIMAGES

		ctx, err := api.ReadValidateAndOptimize(bytes.NewReader(file.Data), conf)
		if err != nil {
			return nil, fmt.Errorf("failed to read and validate PDF file: %w", err)
		}

		boundaries, err := ctx.PageBoundaries(nil)
		if err != nil {
			return nil, fmt.Errorf("failed to get page dimensions: %w", err)
		}
		// The boundaries are indexed by page below, so there has to be exactly one entry per page.
		if len(boundaries) != ctx.PageCount {
			return nil, fmt.Errorf("number of retrieved page boundaries (%d) and pages (%d) differ", len(boundaries), ctx.PageCount)
		}

		// PDF page numbers are 1-based.
		for page := 1; page <= ctx.PageCount; page++ {
			mm, err := pdfcpu.ExtractPageImages(ctx, page, false)
			if err != nil {
				return nil, fmt.Errorf("failed to extract image from page: %w", err)
			}
			if len(mm) == 0 {
				return nil, fmt.Errorf("page %d doesn't contain any images", page)
			}
			if len(mm) > 1 {
				return nil, fmt.Errorf("page %d contains %d images, expected 1", page, len(mm))
			}

			// The physical page size is given by the media box of the page, not by the embedded image.
			boundary := boundaries[page-1]
			dim := boundary.Media.Rect.Dimensions().ToMillimetres()
			dimX, dimY := unit.Millimeter(dim.Width), unit.Millimeter(dim.Height)

			// Decode only image of the page.
			// mm contains exactly one element at this point, the loop is only used to retrieve it.
			for _, m := range mm {
				img, _, err := image.Decode(m)
				if err != nil {
					return nil, fmt.Errorf("failed to decode %q file: %w", ext, err)
				}

				pages = append(pages, Page{
					Image:      img,
					Dimensions: unit.NewPageDimensionsFromLengths(dimX, dimY),
				})
				break
			}
		}

	default:
		return nil, fmt.Errorf("unsupported file extension %q", ext)
	}

	// Apply all rules to all pages.
	// The rules get a pointer to the page inside of the slice, so they are able to modify it in place.
	for iPage := range pages {
		page := &pages[iPage]
		for i, rule := range ingestor.Rules {
			if err := rule.Apply(ingestor, file, page); err != nil {
				return nil, fmt.Errorf("failed to apply ingestor rule %d on page %d: %w", i, iPage, err)
			}
		}
	}

	return pages, nil
}