// -*- C++ -*- /* Copyright 2007 Deutsches Forschungszentrum fuer Kuenstliche Intelligenz or its licensors, as applicable. You may not use this file except under the terms of the accompanying license. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http: www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. Project: File: Purpose: Responsible: tmb Reviewer: Primary Repository: Web Sites: www.iupr.org, www.dfki.de */ $#include "math.h" $#include "colib.h" $#include "imgio.h" $#include "imglib.h" $#include "ocr-layout-rast.h" $//#include "layout-rules.h" $//#include "ocr-recognize-page.h" $#include "ocr-binarize-sauvola.h" $#include "ocr-doc-clean.h" $#include "ocr-deskew-rast.h" $#include "ocr-text-image-seg.h" $#include "ocr-text-image-seg-leptonica.h" $#include "voronoi-ocropus.h" $#include "ocr-pageseg-xycut.h" $#include "langmod-shortest-path.h" $#include "segmentation.h" $#include "ocr-utils.h" $#include "ocr-segmentations.h" $#include "bpnet.h" $#include "ocrcomponents.h" $#include "ocr-util.h" $#include "regionextractor.h" $#include "pages.h" $#include "lines.h" $#include "grouper.h" $#include "grouping.h" $#include "charlib.h" $#ifdef HAVE_TESSERACT $#include "recognized-page.h" $#endif $#include "narray-io.h" $#include "lattice.h" $#include "beam-search.h" $#include "a-star.h" $#include "classify-chars.h" $#include "didegrade.h" $#include "logger.h" $#include "idmap.h" $#include "xml-entities.h" $#include "editdist.h" $#include "fst-io.h" $#include "fst-em.h" $//#include "ocr-recognize-line.h" $//#include "ocr-recognize-page.h" $#ifdef HAVE_FST $#include "fstutil.h" $#include "fstbuilder.h" $#include "fstmodels.h" $#endif $#include "ocr-word-segmentation.h" $#include "line-info.h" $#include "mnist.h" $using namespace ocropus; $using namespace colib; $using namespace iulib; module ocr { struct point { int x; int y; point(); point(int x,int y); }; // ocr-utils void make_line_segmentation_black(intarray &image); void make_line_segmentation_white(intarray &image); void make_page_segmentation_black(intarray &image); void make_page_segmentation_white(intarray &image); // OCRopus objects class idmap { idmap(); void segments_of_id(intarray &result,int id); void ids_of_segment(intarray &result,int segment); void associate(int id, int segment); void clear(); }; struct IComponent { virtual ~IComponent() {} virtual const char *description() = 0; virtual void set(const char *,const char *); virtual void set(const char *,double); }; struct ICleanupGray : IComponent { virtual void cleanup(bytearray &out,bytearray &in) = 0; }; struct ICleanupBinary : IComponent { virtual void cleanup(bytearray &out,bytearray &in) = 0; }; struct ISegmentPage : IComponent { virtual void segment(intarray &out,bytearray &in); virtual void segment(intarray &out,bytearray &in,rectarray &obstacles); }; struct ISegmentLine : IComponent { virtual void charseg(intarray &out,bytearray &in) = 0; }; struct IBinarize : IComponent { virtual void binarize(bytearray &out,bytearray &in) = 0; virtual void binarize(bytearray &out,floatarray &in) = 0; }; struct ITextImageClassification : IComponent { virtual void textImageProbabilities(intarray &out,bytearray &in) = 0; }; struct IGenericFst : IComponent { void clear(); int newState(); void addTransition(int from,int to,int output,float cost,int input); void setStart(int node); void setAccept(int node,float f=0.0); int special(const char *s); void bestpath(nustring &result); void setString(nustring &result,floatarray &costs,intarray &ids); virtual int nStates(); virtual int getStart(); virtual float getAcceptCost(int node); virtual void arcs(intarray &ids, intarray &targets, intarray &outputs, floatarray &costs, int from); void save(const char *path); void load(const char *path); }; struct ICharacterClassifier : IComponent { virtual void setImage(bytearray &input_image); virtual void setImage(bytearray &image,int base_y, int xheight_y, int descender_y, int ascender_y); virtual int length(); virtual void cls(nustring &result, int i); virtual float cost(int i); virtual void startTraining(const char *type="adaptation"); virtual void addTrainingChar(bytearray &input_image,nustring &characters); virtual void addTrainingChar(bytearray &image,int base_y, int xheight_y, int descender_y, int ascender_y,nustring &characters); virtual void addTrainingChar(bytearray &image,bytearray &mask, nustring &characters); virtual void finishTraining(); void save(const char *path); void load(const char *path); virtual void best(nustring &result); virtual ~ICharacterClassifier() {} }; struct IRecognizeLine : IComponent { virtual void recognizeLine(IGenericFst &result,bytearray &image); virtual void startTraining(const char *type="adaptation"); virtual void addTrainingLine(bytearray &image,nustring &transcription); virtual void addTrainingLine(intarray &segmentation, bytearray &image_grayscale, nustring &transcription); virtual void finishTraining(); virtual void save(const char *stream); virtual void load(const char *stream); virtual ~IRecognizeLine() {} virtual void recognizeLine(intarray &segmentation,IGenericFst &result, bytearray &image); virtual void align(nustring &chars,intarray &result,floatarray &costs, bytearray &image,IGenericFst &transcription); }; struct IBestPath { virtual void bestpath(nustring &result,floatarray &costs,intarray &ids,intarray &states) = 0; }; struct ISearchableFst : IGenericFst, IBestPath {}; ISearchableFst *make_ShortestPathSearchableFst(); void binarize_by_range(bytearray &image,float fraction=0.5); void binarize_by_range(bytearray &out,floatarray &in,float fraction=0.5); IBinarize *make_BinarizeByRange(); IBinarize *make_BinarizeByOtsu(); IBinarize *make_BinarizeBySauvola(); ICleanupBinary *make_DeskewPageByRAST(); // FIXME ICleanupGray *make_DeskewGrayByRAST(); double estimate_skew_by_rast(bytearray &in); ICleanupBinary *make_RemoveImageRegionsBinary(); ICleanupBinary *make_DocClean(); ICleanupGray *make_RemoveImageRegionsGray(); ITextImageClassification *make_TextImageSegByLogReg(); ITextImageClassification *make_TextImageSegByLeptonica(); // Global functions to operate on text/image probability map void remove_masked_region(bytearray &out,bytearray &mask,bytearray &in); void remove_rectangular_region(bytearray &out,rectarray &boxes,bytearray &in); void get_nontext_mask(bytearray &out,intarray &in); void get_nontext_boxes(rectarray &nontext_boxes,intarray &text_img_map); ISegmentPage *make_SegmentPageByMorphTrivial(); ISegmentPage *make_SegmentPageBy1CP(); ISegmentPage *make_SegmentPageByRAST(); ISegmentPage *make_SegmentPageByVORONOI(); ISegmentPage *make_SegmentPageByXYCUTS(); ISegmentPage *make_SegmentWords(); ISegmentLine *make_SegmentLineByProjection(); ISegmentLine *make_SegmentLineByCCS(); ISegmentLine *make_ConnectedComponentSegmenter(); ISegmentLine *make_CurvedCutSegmenter(); ISegmentLine *make_SkelSegmenter(); void count_neighbors(bytearray &result,bytearray &image); void find_endpoints(bytearray &result,bytearray &image); void find_junctions(bytearray &result,bytearray &image); void remove_singular_points(bytearray &image,int d); struct RegionExtractor { RegionExtractor(); ~RegionExtractor(); void setImage(intarray &image); void setImageMasked(intarray &image,int mask=0xffffff,int lo=1,int hi=999999999); void setPageColumns(intarray &image); void setPageParagraphs(intarray &image); void setPageLines(intarray &image); int length(); rectangle bbox(int i); void bounds(int i,int *x0=0,int *y0=0,int *x1=0,int *y1=0); int x0(int i); int y0(int i); int x1(int i); int y1(int i); void extract(bytearray &output,bytearray &input,int index,int margin=0); void extract(intarray &output,bytearray &input,int index,int margin=0); void extract(intarray &output,intarray &input,int index,int margin=0); void extract(floatarray &output,floatarray &input,int index,int margin=0); void extract(floatarray &output,bytearray &input,int index,int margin=0); void extract(floatarray &output,intarray &input,int index,int margin=0); void mask(bytearray &output,int index,int margin=0); void mask(intarray &output,int index,int margin=0); void mask(floatarray &output,int index,int margin=0); }; void debug_array(bytearray &); void debug_array(intarray &); void debug_array(floatarray &); void check_line_segmentation(intarray &cseg); void check_page_segmentation(intarray &cseg); void sort_by_xcenter(intarray &); struct IGrouper : IComponent { void setSegmentation(intarray &segmentation); int length(); void getMask(rectangle &r,bytearray &mask,int index,int margin); rectangle boundingBox(int index); void extract(bytearray &out,bytearray &mask,bytearray &source,int index,int grow=0); void extract(bytearray &out,bytearray &source,byte dflt,int index,int grow=0); void extract(floatarray &out,bytearray &mask,floatarray &source,int index,int grow=0); void extract(floatarray &out,floatarray &source,float dflt,int index,int grow=0); void setClass(int index,int cls,float cost); void getLattice(IGenericFst &fst); }; IGrouper *make_SimpleGrouper(); struct Pages { Pages(); ~Pages(); void clear(); void addFile(const char *file); void parseSpec(const char *spec); void wantGray(bool flag); void wantColor(bool flag); void setAutoInvert(bool flag); void setBinarizer(IBinarize *arg); int length(); void getPage(int index); bool nextPage(); void rewind(); void loadImage(); const char *getFileName(); bool hasGray(); bool hasColor(); void getBinary(bytearray &dst); void getGray(bytearray &dst); void getColor(intarray &dst); }; /* struct ILines : IComponent { virtual int pagesCount(); virtual void processPage(int index); virtual double getTotalElapsedTime(); virtual double getCurrentPageElapsedTime(); virtual int linesCount(); virtual void line(bytearray &result_image, bytearray &result_mask, int index); virtual int columnIndex(int line); virtual int paragraphIndex(int line); void grayPage(bytearray &); void binaryPage(bytearray &); void segmentation(intarray &r); virtual int pageWidth(); virtual int pageHeight(); virtual const char *pageDescription(); virtual rectangle bbox(int index); virtual void setBinarizer (IBinarize *); virtual void setDeskewer (ICleanupGray *); virtual void addCleanupGray (ICleanupGray *); virtual void addCleanupBinary (ICleanupBinary *); virtual void setPageSegmenter (ISegmentPage *); }; ILines *make_Lines(Pages *); ILines *make_Lines(const char *page_specs); */ /* void recognize(RecognizedPage &result, ILines &lines, ILineOCR &ocr, IGenericFst &fst, IBestPath &bestpath, int pageno = 0); $ IBestPath &as_IBestPath(ISearchableFst &s) { $ return s; $ } IBestPath &as_IBestPath(ISearchableFst &s); /// 1-pass recognition. void recognize(RecognizedPage &result, ILines &lines, ISegmentLine &lineseg, IRecognizeLine &rec, IGenericFst &fst, IBestPath &bestpath, float adaption_threshold, int pageno = 0); /// 1-pass recognition with adaptation. void recognize(RecognizedPage &result, ILines &lines, ISegmentLine &lineseg, IRecognizeLine &rec, IGenericFst &fst, IBestPath &bestpath, ITrainChars &trainable, float adaption_threshold, int pageno = 0); /// 2-pass recognition. void recognize(RecognizedPage &result, ILines &lines, ISegmentLine &lineseg, IRecognizeLine &pass1, IGenericFst &fst1, IBestPath &bestpath1, ITrainChars &trainable, IRecognizeLine &pass2, IGenericFst &fst2, IBestPath &bestpath2, float adaption_threshold, int pageno = 0); */ #define M_PI M_PI /* bool utf8_encode(bytearray &utf8, intarray &lengths, int unicode_char); bool utf8_encode(bytearray &utf8, intarray &lengths, nustring &text); bool utf8_encode(bytearray &utf8, int unicode_char); bool utf8_encode(bytearray &utf8, nustring &text); bool utf8_decode(int &result, int &length, bytearray &utf8, int offset); bool utf8_decode(nustring &text, intarray &lengths, bytearray &utf8); bool utf8_decode(nustring &text, bytearray &utf8); */ $ void str2array(bytearray &out,const char *s) { $ int n = strlen(s); $ out.resize(n); $ for(int i=0;i " pairs. ICharacterLibrary *make_pnglist_charlib(const char *list); /// Make a CharacterLibrary from a ocropus-generated pair of files. ICharacterLibrary *make_ocropus_charlib(const char *prefix); /// Make a CharacterLibrary from a forced-alignment-generated pair of files. ICharacterLibrary *make_SegmentationCharlib(const char *file_list_path, bool produce_garbage = false, const char* path = 0); /// Make a CharacterLibrary from a forced-alignment-generated pair of files. ICharacterLibrary *make_SegmentationCharlib(const char *image_path, const char *text_path, bool produce_garbage); // void dump_charlib(const char *path, ICharacterLibrary &); void parse_vector(intarray &, const char *); const char *get_version_string(); void set_version_string(const char *); $ const char *hardcoded_version_string(); const char *hardcoded_version_string(); /*void detect_headlines(RecognizedPage &, ILines &); void detect_paragraphs(RecognizedPage &, ILines &);*/ void concat_segmentation(intarray &dst, intarray &src, int x, int y); void remove_small_components(intarray &bimage,int mw,int mh); void remove_marginal_components(intarray &bimage,int x0,int y0,int x1,int y1); void remove_small_components(bytearray &bimage,int mw,int mh); void remove_marginal_components(bytearray &bimage,int x0,int y0,int x1,int y1); void runlength_histogram(floatarray &hist, bytearray &img,rectangle r,bool white=false,bool vert=false); void runlength_histogram(floatarray &hist, bytearray &img,bool white=false,bool vert=false); int find_median_in_histogram(floatarray &); void make_background_white(bytearray &); class IClassifier { virtual ~IClassifier() {} virtual void set(const char *name,double value) = 0; virtual int classify(floatarray &v) = 0; virtual int classify(float &cost,floatarray &v) = 0; virtual void discriminant(floatarray &result,floatarray &v) = 0; virtual void discriminant(float &cost,floatarray &result,floatarray &v) = 0; virtual void posterior(floatarray &result,floatarray &v) = 0; virtual void posterior(float &cost,floatarray &result,floatarray &v) = 0; virtual void add(floatarray &v,int c) = 0; virtual void train() = 0; virtual void save(FILE *stream) = 0; virtual void load(FILE *stream) = 0; void save(const char *path); void load(const char *path); }; ICharacterClassifier *make_AdaptClassifier( IClassifier*, bool output_garbage = false, const char* features = "111101110", int dim_x = 10, int dim_y = 10); ICharacterClassifier *make_BpnetCharacterClassifier( const char* strfeatures = "111101010", int dim_x = 10, int dim_y = 10); IClassifier *make_BpnetClassifier(); IClassifier *make_BpnetClassifierDumpIntoFile(const char *path); void set_resource_path(const char *path); void find_and_load_ICharacterClassifier(ICharacterClassifier &, const char *resource); void find_and_load_IRecognizeLine(IRecognizeLine &i, const char *resource); void extract_segment(bytearray &result, intarray &image, int n); IGenericFst *make_StandardFst(); void beam_search(intarray &ids, intarray &vertices, intarray &outputs, floatarray &costs, IGenericFst &graph, int beam_width = 100); void beam_search(nustring &result, IGenericFst &fst, int beam_width = 100); void beam_search_in_composition(intarray &inputs, intarray &vertices1, intarray &vertices2, intarray &outputs, floatarray &costs, IGenericFst &fst1, IGenericFst &fst2, int beam_width = 100, int override_start = -1, int override_finish = -1); void beam_search_in_composition(nustring &result, IGenericFst &fst, IGenericFst &fst2, int beam_width = 100, int override_start = -1, int override_finish = -1); bool a_star(intarray &ids, intarray &vertices, intarray &outputs, floatarray &costs, IGenericFst &graph); void a_star_backwards(floatarray &costs_for_all_nodes, IGenericFst &graph); bool a_star_in_composition(intarray &inputs, intarray &vertices1, intarray &vertices2, intarray &outputs, floatarray &costs, IGenericFst &fst1, IGenericFst &fst2); bool a_star_in_composition(intarray &inputs, intarray &vertices1, intarray &vertices2, intarray &vertices3, intarray &outputs, floatarray &costs, IGenericFst &fst1, IGenericFst &fst2, IGenericFst &fst3); void fst_write(const char *path, IGenericFst &fst); void fst_read(IGenericFst &fst, const char *path); /// Degrade a grayscale text image by applying Baird's degradation model. void degrade(bytearray &image, double jitter_mean = .2, double jitter_sigma = .1, double sensitivity_mean = .125, double sensitivity_sigma = .04, double threshold_mean = .4, double threshold_sigma = .04); class Logger { bool enabled; /// \brief /// Construct a logger with a given name /// and decide whether it's enabled or not. Logger(const char *name); /// Recolor a segmentation and log it. void recolor(const char *description, intarray &, float = 100.); /// Just log the message. void log(const char *message); /// Log a boolean value. void log(const char *message, bool); /// Log an integer value. void log(const char *message, int); /// Log a double value. void log(const char *message, double); /// Log a string. void log(const char *description, const char *); /// \brief Log a grayscale image. /// /// If the image is not 2-dimensional, /// it will be written as text. void log(const char *description, bytearray &, float = 100.); /// \brief Log a color image. /// /// If the image is not 2-dimensional, /// it will be written as text. void log(const char *description, intarray &, float = 100.); /// \brief Log an array of floats. /// void log(const char *description, floatarray &); /// Log a nuchar value. void log(const char *description, nuchar); /// Log a nustring value, decoding it to UTF-8. void log(const char *description, nustring &); /// Log a rectangle. void log(const char *description, rectangle &); // not good for Lua binding - shadows arrays //void log(const char *description, void *); /// Get a bestpath and log it. void log(const char *description, IGenericFst &); /// Increase indentation level in the log. void indent(); /// Decrease indentation level in the log. void dedent(); }; void set_logger_directory(const char *); const char *get_logger_directory(); void visualize_segmentation_by_RAST(intarray &result, bytearray &in_not_inverted); // stuff from ocr-segmentations.h /// Remove from the segmentation those pixels which are white in gray_image. void binarize_in_segmentation(intarray &segmentation, /* const */ bytearray &gray_image); /// Set line number for all foreground pixels in a character segmentation. void set_line_number(intarray &a, int lnum); /// If the line is too small or too large, rescale it (with the mask) /// to a decent height (30-60 pixels). void rescale_if_needed(bytearray &bin_line, bytearray &gray_line); /// Make a binary image from a line segmentation. void forget_segmentation(bytearray &image, intarray &segmentation); /// Return true if there are no zeros in the array. bool has_no_black_pixels(intarray &); void blit_segmentation_line(intarray &page, rectangle bbox, intarray &line, int line_no); /// Blit the segmentation of src onto dst shifted by (x,y) and shifted by /// values by max(dst). void concat_segmentation(intarray &dst, intarray &src, int x, int y); // Enlarge segmentation and AND it with line_mask. // Don't pass binarized grayscale image as line_mask, // otherwise you might get debris not from the line. // (that means we cannot really call this from inside LineOCR) void normalize_segmentation(intarray &segmentation, bytearray &line_mask); int max_cnum(intarray &seg); void remove_neighbour_line_components(bytearray &line); void rescore_path(IGenericFst &fst, intarray &inputs, intarray &vertices, intarray &outputs, floatarray &new_costs, int override_start = -1); void beam_search_and_rescore(IGenericFst &main, IGenericFst &transcript, double coef, int beam_width = 10, int override_start = -1, int override_finish = -1); void a_star_and_rescore(IGenericFst &main, IGenericFst &transcript, double coef); // __________________ editdist.h ____________________ float edit_distance(nustring &str1,nustring &str2, float del_cost=1, float ins_cost=1, float sub_cost=1); float block_move_edit_cost(nustring &from, nustring &to, float c); float block_move_edit_distance(nustring &a, nustring &b, float c); float block_move_edit_cost_record_jumps(intarray &jumps_from, intarray &jumps_to, nustring &from, nustring &to, float c); void analyze_jumps(bytearray &area_covered_by_non_jumps, intarray &from, intarray &to, int source_length); void get_text_jumped_over(nustring &result, bytearray &covered, nustring &text); float edit_cost_for_layout(intarray &jumps_from, intarray &jumps_to, nustring &from, nustring &to, float c); // ___________________________________________________ bool is_oversegmentation_of(intarray &s1, intarray &s2); // __________________________ line-info.h __________________________________ struct TextLineExtended { TextLineExtended(); float c; float m; float x; float a; float d; rectangle bbox; }; /// Another, more Lua-friendly interface for get_extended_line_info(). /// Packs all the line parameters into one array. bool get_extended_line_info(TextLineExtended &result, intarray &seg); /// Lua-friendly interface for get_extended_line_info_using_css(). bool get_extended_line_info_using_ccs(TextLineExtended &result, bytearray &seg); void paint_line(intarray &image, TextLineExtended l); void binarize_simple(bytearray &, bytearray &); void binarize_simple(bytearray &); void invert(bytearray &); bool segment_words_by_projection(intarray &seg, bytearray &in, int nwords); void rough_rescale(floatarray &dst, const floatarray &src, int w, int h); void bicubic_rescale(floatarray &dst, const floatarray &src, int w, int h); void rescale(floatarray &dst, const floatarray &src, int w, int h); void rescale(bytearray &dst, const bytearray &src, int w, int h); void rescale_to_width(floatarray &dst, const floatarray &src, int w); void rescale_to_width(bytearray &dst, const bytearray &src, int w); void rescale_to_height(floatarray &dst, const floatarray &src, int h); void rescale_to_height(bytearray &dst, const bytearray &src, int h); void paint_rectangles(intarray &image,rectarray &rectangles); void xml_unescape(nustring &, const char *); void rotate_90(bytearray &,bytearray &); void rotate_180(bytearray &,bytearray &); void rotate_270(bytearray &,bytearray &); // FIXME this doesn't belong here --tmb struct IFeatureStream { virtual int nsamples() = 0; virtual bool read(floatarray &, int &label) = 0; virtual void write(floatarray &, int label) = 0; virtual ~IFeatureStream(); }; // FIXME this doesn't belong here --tmb IFeatureStream *make_MnistReader(const char *prefix, bool search_in_data_dir = false); // FIXME this doesn't belong here --tmb IFeatureStream *make_MnistWriter(const char *prefix); // FIXME this doesn't belong here --tmb void MNIST_60K(bytearray &images, intarray &labels); // FIXME this doesn't belong here --tmb void MNIST_10K(bytearray &images, intarray &labels); struct ITrainableFst : IGenericFst { virtual void expectation(IGenericFst &left, IGenericFst &right); virtual void maximization(); }; ITrainableFst *make_StandardTrainableFst(); void fst_copy(IGenericFst &dst, IGenericFst &src); void fst_copy_best_arcs_only(IGenericFst &dst, IGenericFst &src); void fst_insert(IGenericFst &dst, IGenericFst &src, int start, int accept); } // module ocr $[ function import_all(module) for key, value in pairs(module) do _G[key] = value end end $]