package document

import (
	"bytes"
	"fmt"
	"image"
	"image/jpeg"
	"image/png"
	"path/filepath"
	"strings"

	"Scanyonero/unit"

	"github.com/chai2010/tiff"
	"github.com/pdfcpu/pdfcpu/pkg/api"
	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu"
	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
)

// Ingestor contains all settings and rules for image/document file ingestion.
type Ingestor struct {
	DefaultDPI unit.PerInch // Default/fallback dots per inch value.

	Rules []IngestorRule
}

// Ingest decodes the given file into one or more pages and applies all
// ingestor rules to every resulting page.
//
// The decoder is selected by the (case-insensitive) extension of file.Name:
//
//   - .jpg/.jpeg: a single page. The page dimensions are derived from the
//     density returned by decodeJFIF when that succeeds, and from DefaultDPI
//     otherwise.
//   - .png: a single page, sized using DefaultDPI.
//   - .tif/.tiff: one page per decoded image, each sized using DefaultDPI.
//   - .pdf: one page per PDF page. Every PDF page must contain exactly one
//     image; the page dimensions are taken from the page's media box.
//
// Any other extension results in an error. After decoding, each rule in
// ingestor.Rules is applied in order to each page; the first failing rule
// aborts ingestion. On error the returned page slice is nil.
func (ingestor Ingestor) Ingest(file File) ([]Page, error) {
	ext := filepath.Ext(file.Name)

	var pages []Page

	switch strings.ToLower(ext) {
	case ".jpg", ".jpeg":
		img, err := jpeg.Decode(bytes.NewReader(file.Data))
		if err != nil {
			return nil, fmt.Errorf("failed to decode JPEG file: %w", err)
		}

		// Start with the fallback density and only override it when the file
		// carries usable JFIF metadata.
		dimensions := unit.NewPageDimensionsFromDensity(img.Bounds().Dx(), img.Bounds().Dy(), ingestor.DefaultDPI, ingestor.DefaultDPI)
		if tag, err := decodeJFIF(bytes.NewReader(file.Data)); err == nil {
			// Get more exact density info from the file metadata.
			xDensity, yDensity := tag.Density()
			dimensions = unit.NewPageDimensionsFromDensity(img.Bounds().Dx(), img.Bounds().Dy(), xDensity, yDensity)
		}

		pages = append(pages, Page{
			Image:      img,
			Dimensions: dimensions,
		})

	case ".png":
		img, err := png.Decode(bytes.NewReader(file.Data))
		if err != nil {
			return nil, fmt.Errorf("failed to decode PNG file: %w", err)
		}

		pages = append(pages, Page{
			Image:      img,
			Dimensions: unit.NewPageDimensionsFromDensity(img.Bounds().Dx(), img.Bounds().Dy(), ingestor.DefaultDPI, ingestor.DefaultDPI),
		})
		// TODO: Read pixel density metadata from PNG file

	case ".tif", ".tiff":
		frames, frameErrs, err := tiff.DecodeAll(bytes.NewReader(file.Data))
		if err != nil {
			return nil, fmt.Errorf("failed to decode TIFF file: %w", err)
		}

		for i, frame := range frames {
			for j, img := range frame {
				// DecodeAll reports failures of individual images through its
				// second return value instead of err. Without this check a
				// failed image would reach img.Bounds() as a nil interface and
				// panic.
				if i < len(frameErrs) && j < len(frameErrs[i]) && frameErrs[i][j] != nil {
					return nil, fmt.Errorf("failed to decode TIFF image %d (sub-image %d): %w", i, j, frameErrs[i][j])
				}
				if img == nil {
					return nil, fmt.Errorf("failed to decode TIFF image %d (sub-image %d): no image data", i, j)
				}

				pages = append(pages, Page{
					Image:      img,
					Dimensions: unit.NewPageDimensionsFromDensity(img.Bounds().Dx(), img.Bounds().Dy(), ingestor.DefaultDPI, ingestor.DefaultDPI),
				})
				// TODO: Read pixel density metadata from TIFF file
			}
		}

	case ".pdf":
		conf := model.NewDefaultConfiguration()
		conf.Cmd = model.EXTRACTIMAGES
		ctx, err := api.ReadValidateAndOptimize(bytes.NewReader(file.Data), conf)
		if err != nil {
			return nil, fmt.Errorf("failed to read and validate PDF file: %w", err)
		}

		boundaries, err := ctx.PageBoundaries(nil)
		if err != nil {
			return nil, fmt.Errorf("failed to get page dimensions: %w", err)
		}
		// boundaries is indexed by page below, so it must cover every page.
		if len(boundaries) != ctx.PageCount {
			return nil, fmt.Errorf("number of retrieved page boundaries (%d) and pages (%d) differ", len(boundaries), ctx.PageCount)
		}

		// Pages are numbered starting at 1.
		for page := 1; page <= ctx.PageCount; page++ {
			mm, err := pdfcpu.ExtractPageImages(ctx, page, false)
			if err != nil {
				return nil, fmt.Errorf("failed to extract image from page: %w", err)
			}
			if len(mm) == 0 {
				return nil, fmt.Errorf("page %d doesn't contain any images", page)
			}
			if len(mm) > 1 {
				return nil, fmt.Errorf("page %d contains %d images, expected 1", page, len(mm))
			}

			boundary := boundaries[page-1]
			dim := boundary.Media.Rect.Dimensions().ToMillimetres()
			dimX, dimY := unit.Millimeter(dim.Width), unit.Millimeter(dim.Height)

			// Decode only image of the page. The length checks above guarantee
			// that mm holds exactly one entry, so the loop body runs once.
			for _, m := range mm {
				img, _, err := image.Decode(m)
				if err != nil {
					return nil, fmt.Errorf("failed to decode %q file: %w", ext, err)
				}

				pages = append(pages, Page{
					Image:      img,
					Dimensions: unit.NewPageDimensionsFromLengths(dimX, dimY),
				})
				break
			}
		}

	default:
		return nil, fmt.Errorf("unsupported file extension %q", ext)
	}

	// Apply the rules through a pointer into the slice so that any changes a
	// rule makes to a page are kept in the returned result.
	for iPage := range pages {
		page := &pages[iPage]
		for i, rule := range ingestor.Rules {
			if err := rule.Apply(ingestor, file, page); err != nil {
				return nil, fmt.Errorf("failed to apply ingestor rule %d on page %d: %w", i, iPage, err)
			}
		}
	}

	return pages, nil
}