mirror of
https://github.com/Dadido3/Scanyonero.git
synced 2025-06-06 17:30:00 +00:00
- Rename to Scanyonero - Add FTP server that ingests TIFF, PNG, JPEG or PDF files - Add web interface to check and modify ingested files - Rework how ocrmypdf is invoked Basics are working, but the program is not in a usable state.
139 lines
3.9 KiB
Go
139 lines
3.9 KiB
Go
package document
|
|
|
|
import (
|
|
"Scanyonero/unit"
|
|
"bytes"
|
|
"fmt"
|
|
"image"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/chai2010/tiff"
|
|
"github.com/pdfcpu/pdfcpu/pkg/api"
|
|
"github.com/pdfcpu/pdfcpu/pkg/pdfcpu"
|
|
"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
|
|
|
|
"image/jpeg"
|
|
"image/png"
|
|
)
|
|
|
|
// Ingestor contains all settings and rules for image/document file ingestion.
|
|
type Ingestor struct {
|
|
DefaultDPI unit.PerInch // Default/fallback dots per inch value.
|
|
|
|
Rules []IngestorRule
|
|
}
|
|
|
|
func (ingestor Ingestor) Ingest(file File) ([]Page, error) {
|
|
ext := filepath.Ext(file.Name)
|
|
|
|
var pages []Page
|
|
|
|
switch strings.ToLower(ext) {
|
|
case ".jpg", ".jpeg":
|
|
img, err := jpeg.Decode(bytes.NewReader(file.Data))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to decode JPEG file: %w", err)
|
|
}
|
|
dimensions := unit.NewPageDimensionsFromDensity(img.Bounds().Dx(), img.Bounds().Dy(), ingestor.DefaultDPI, ingestor.DefaultDPI)
|
|
if tag, err := decodeJFIF(bytes.NewReader(file.Data)); err == nil {
|
|
// Get more exact density info from the file metadata.
|
|
xDensity, yDensity := tag.Density()
|
|
dimensions = unit.NewPageDimensionsFromDensity(img.Bounds().Dx(), img.Bounds().Dy(), xDensity, yDensity)
|
|
}
|
|
pages = append(pages, Page{
|
|
Image: img,
|
|
Dimensions: dimensions,
|
|
})
|
|
|
|
case ".png":
|
|
img, err := png.Decode(bytes.NewReader(file.Data))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to decode PNG file: %w", err)
|
|
}
|
|
pages = append(pages, Page{
|
|
Image: img,
|
|
Dimensions: unit.NewPageDimensionsFromDensity(img.Bounds().Dx(), img.Bounds().Dy(), ingestor.DefaultDPI, ingestor.DefaultDPI),
|
|
})
|
|
// TODO: Read pixel density metadata from PNG file
|
|
|
|
case ".tiff":
|
|
mm, _, err := tiff.DecodeAll(bytes.NewReader(file.Data))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to decode TIFF file: %w", err)
|
|
}
|
|
for _, m := range mm {
|
|
for _, img := range m {
|
|
pages = append(pages, Page{
|
|
Image: img,
|
|
Dimensions: unit.NewPageDimensionsFromDensity(img.Bounds().Dx(), img.Bounds().Dy(), ingestor.DefaultDPI, ingestor.DefaultDPI),
|
|
})
|
|
// TODO: Read pixel density metadata from TIFF file
|
|
}
|
|
}
|
|
|
|
case ".pdf":
|
|
conf := model.NewDefaultConfiguration()
|
|
conf.Cmd = model.EXTRACTIMAGES
|
|
ctx, err := api.ReadValidateAndOptimize(bytes.NewReader(file.Data), conf)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to read and validate PDF file: %w", err)
|
|
}
|
|
|
|
boundaries, err := ctx.PageBoundaries(nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get page dimensions: %w", err)
|
|
}
|
|
if len(boundaries) != ctx.PageCount {
|
|
return nil, fmt.Errorf("number of retrieved page boundaries (%d) and pages (%d) differ", len(boundaries), ctx.PageCount)
|
|
}
|
|
|
|
for page := 1; page <= ctx.PageCount; page++ {
|
|
mm, err := pdfcpu.ExtractPageImages(ctx, page, false)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to extract image from page: %w", err)
|
|
}
|
|
if len(mm) == 0 {
|
|
return nil, fmt.Errorf("page %d doesn't contain any images", page)
|
|
}
|
|
if len(mm) > 1 {
|
|
return nil, fmt.Errorf("page %d contains %d images, expected 1", page, len(mm))
|
|
}
|
|
|
|
boundary := boundaries[page-1]
|
|
dim := boundary.Media.Rect.Dimensions().ToMillimetres()
|
|
dimX, dimY := unit.Millimeter(dim.Width), unit.Millimeter(dim.Height)
|
|
|
|
// Decode only image of the page.
|
|
for _, m := range mm {
|
|
img, _, err := image.Decode(m)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to decode %q file: %w", ext, err)
|
|
}
|
|
|
|
pages = append(pages, Page{
|
|
Image: img,
|
|
Dimensions: unit.NewPageDimensionsFromLengths(dimX, dimY),
|
|
})
|
|
|
|
break
|
|
}
|
|
|
|
}
|
|
|
|
default:
|
|
return nil, fmt.Errorf("unsupported file extension %q", ext)
|
|
}
|
|
|
|
for iPage := range pages {
|
|
page := &pages[iPage]
|
|
for i, rule := range ingestor.Rules {
|
|
if err := rule.Apply(ingestor, file, page); err != nil {
|
|
return nil, fmt.Errorf("failed to apply ingestor rule %d on page %d: %w", i, iPage, err)
|
|
}
|
|
}
|
|
}
|
|
|
|
return pages, nil
|
|
}
|