00001
00002
00003
00004
00005
00006 #ifndef HPP_CPPTextExtractor
00007 #define HPP_CPPTextExtractor
00008
00009 #include <PDF/Page.h>
00010 #include <PDF/Rect.h>
00011 #include <Common/UString.h>
00012 #include <C/PDF/TRN_TextExtractor.h>
00013
00014 namespace pdftron {
00015 namespace PDF {
00016
/// C++ wrapper around an opaque TRN_TextExtractor handle (see mp_extractor at
/// the end of the class) that exposes a page's text as words and lines.
///
/// Only declarations appear in this class body; none of the members has an
/// inline definition here.
///
/// Usage shape suggested by the declarations (inferred from the API, confirm
/// against the library documentation): call Begin() with a Page, then either
/// query GetWordCount() / GetAsText() / GetAsXML(), or walk the result with
/// GetFirstLine() -> Line::GetNextLine() and Line::GetFirstWord() ->
/// Word::GetNextWord(), testing IsValid() on each returned object.
///
/// NOTE(review): a destructor is declared, but no copy constructor or copy
/// assignment operator. If the destructor releases mp_extractor, copying a
/// TextExtractor would be unsafe - confirm in the implementation.
00101 class TextExtractor
00102 {
00103 public:
00104
/// Default constructor and destructor. What they do with mp_extractor is not
/// visible from this declaration.
00108 TextExtractor();
00109 ~TextExtractor();
00110
/// Option bits for text processing. The enumerators are distinct powers of
/// two (1, 2, 4, 8, 16), so they can be combined with bitwise OR.
/// Presumably these are the values accepted by the `flags` argument of
/// Begin() - confirm. The per-enumerator notes below are read off the names
/// only and should be verified against the library documentation.
00115 enum ProcessingFlags
00116 {
00117
00118
00119
// Name suggests: do not expand ligatures - TODO confirm.
00120 e_no_ligature_exp = 1,
00121
00122
00123
// Name suggests: do not remove duplicated text - TODO confirm.
00124 e_no_dup_remove = 2,
00125
00126
00127
// Name suggests: treat punctuation as a word break - TODO confirm.
00128 e_punct_break = 4,
00129
00130
00131
00132
00133
// Name suggests: drop text that is hidden - TODO confirm what counts as hidden.
00134 e_remove_hidden_text = 8,
00135
00136
00137
00138
00139
// Name suggests: skip invisible text - TODO confirm how this differs from
// e_remove_hidden_text.
00140 e_no_invisible_text = 16
00141 };
00142
/// Starts extraction for a page.
/// @param page     The page to process (passed by value).
/// @param clip_ptr Optional rectangle; defaults to a null pointer (0).
///                 Presumably restricts extraction to that region - confirm.
/// @param flags    Defaults to 0. Presumably a bitwise OR of ProcessingFlags
///                 values - confirm.
00152 void Begin(Page page, const Rect* clip_ptr = 0, UInt32 flags = 0);
00153
/// @return A word count as an int; presumably the number of words found for
///         the page given to Begin() - confirm.
00157 int GetWordCount();
00158
/// Produces the text through an output parameter.
/// @param out_str  Non-const reference that receives the result.
/// @param dehyphen Defaults to true. Presumably controls whether words
///                 hyphenated across lines are re-joined - confirm.
00172 void GetAsText(UString& out_str, bool dehyphen = true);
00173
/// Option bits for XML output. The enumerators are distinct powers of two
/// (1, 2, 4), so they can be combined with bitwise OR. Presumably these are
/// the values accepted by the `xml_output_flags` argument of GetAsXML() -
/// confirm. The per-enumerator notes are read off the names only.
00177 enum XMLOutputFlags
00178 {
00179
// Name suggests: emit each word as its own XML element - TODO confirm.
00180 e_words_as_elements = 1,
00181
00182
00183
// Name suggests: include bounding-box information in the output - TODO confirm.
00184 e_output_bbox = 2,
00185
00186
// Name suggests: include style information in the output - TODO confirm.
00187 e_output_style_info = 4
00188 };
00189
/// Produces an XML representation through an output parameter.
/// @param out_xml          Non-const reference that receives the result.
/// @param xml_output_flags Defaults to 0. Presumably a bitwise OR of
///                         XMLOutputFlags values - confirm.
00232 void GetAsXML(UString& out_xml, UInt32 xml_output_flags = 0);
00233
/// Wrapper around a TRN_TextExtractorStyle value describing text styling.
/// Instances are returned by value from Word::GetStyle(),
/// Word::GetCharStyle() and Line::GetStyle().
00239 class Style
00240 {
00241 public:
00242
/// @return An SDF::Obj by value; presumably the font object for this style -
///         confirm.
00249 SDF::Obj GetFont();
00250
/// @return The font name as a UString, by value.
00254 UString GetFontName();
00255
/// @return The font size as a double. Units are not visible here.
00264 double GetFontSize();
00265
/// @return The weight as an int. The value range is not visible here.
00274 int GetWeight();
00275
/// @return A bool; by its name, whether the style is italic.
00280 bool IsItalic();
00281
/// @return A bool; by its name, whether the font is a serif font.
00287 bool IsSerif();
00288
/// Writes the colour into a caller-supplied buffer.
/// @param rgb The array parameter decays to UInt8*; the [3] only documents
///            the expected length, so the caller must supply at least three
///            elements. Presumably filled as red, green, blue - confirm.
00292 void GetColor(UInt8 rgb[3]);
00293
// NOTE(review): neither comparison operator is declared const, so they cannot
// be called on a const Style left-hand operand.
00294 bool operator== (const Style& s);
00295 bool operator!= (const Style& s);
00296
/// Default constructor; the resulting state of mp_style is not visible here.
00297 Style();
00298
/// Copy constructor, followed by a converting constructor from the C-API
/// type TRN_TextExtractorStyle. The converting constructor is not marked
/// explicit, so implicit conversion from TRN_TextExtractorStyle is allowed.
00300 Style(const Style& s);
00301 Style(TRN_TextExtractorStyle impl);
// Underlying C-API style value. Declared in the public section.
00302 TRN_TextExtractorStyle mp_style;
00304 };
00305
/// Wrapper around a TRN_TextExtractorWord value representing one word.
/// Instances are returned by value from Line::GetFirstWord(),
/// Line::GetWord() and Word::GetNextWord().
00311 class Word
00312 {
00313 public:
/// @return The number of glyphs as an int.
00317 int GetNumGlyphs();
00318
/// Writes the word's bounding box into a caller-supplied buffer.
/// @param out_bbox Decays to double*; the caller must supply at least four
///                 elements. Element order is not visible here - confirm.
00325 void GetBBox(double out_bbox[4]);
00326
/// Writes the word's quadrilateral into a caller-supplied buffer.
/// @param out_quad Decays to double*; the caller must supply at least eight
///                 elements (presumably four x/y corner pairs - confirm).
00331 void GetQuad(double out_quad[8]);
00332
/// Writes the quadrilateral of one glyph into a caller-supplied buffer.
/// @param glyph_idx Index of the glyph; presumably in
///                  [0, GetNumGlyphs()) - confirm.
/// @param out_quad  Decays to double*; at least eight elements required.
00338 void GetGlyphQuad(int glyph_idx, double out_quad[8]);
00339
/// @param char_idx Index of the character; valid range is not visible here
///                 (presumably [0, GetStringLen()) - confirm).
/// @return The Style for that character, by value.
00344 Style GetCharStyle(int char_idx);
00345
/// @return The Style for the word, by value.
00349 Style GetStyle();
00350
/// @return A length as an int; presumably the number of Unicode units
///         reachable through GetString() - confirm.
00354 int GetStringLen();
00355
/// @return Pointer to const Unicode data. Whether the buffer is
///         null-terminated and how long it stays valid are not visible here;
///         use GetStringLen() for the length and confirm lifetime before
///         retaining the pointer.
00359 const Unicode* GetString();
00360
/// @return Another Word by value; by its name, the word following this one.
///         Presumably IsValid() is false on the result when there is no
///         next word - confirm.
00364 Word GetNextWord();
00365
/// @return An int; by its name, this word's number. Numbering base and
///         scope are not visible here.
00371 int GetCurrentNum();
00372
/// @return A bool; presumably false for a default-constructed or
///         past-the-end Word - confirm.
00376 bool IsValid();
00377
// NOTE(review): neither comparison operator is declared const, so they cannot
// be called on a const Word left-hand operand.
00378 bool operator== (const Word&);
00379 bool operator!= (const Word&);
/// Default constructor; the resulting state of mp_word is not visible here.
00380 Word();
00381
/// Converting constructor from the C-API type TRN_TextExtractorWord. Not
/// marked explicit, so implicit conversion is allowed.
00383 Word(TRN_TextExtractorWord impl);
// Underlying C-API word value. Declared in the public section.
00384 TRN_TextExtractorWord mp_word;
00386 };
00387
/// Wrapper around a TRN_TextExtractorLine value representing one line of
/// text. Instances are returned by value from
/// TextExtractor::GetFirstLine() and Line::GetNextLine().
00393 class Line {
00394 public:
00395
/// @return The number of words as an int.
00399 int GetNumWords();
00400
/// @return A bool. What makes a line "simple" is not visible here - confirm
///         against the library documentation.
00405 bool IsSimpleLine();
00406
/// @return Pointer to const double. Unlike Word::GetBBox(), which fills a
///         caller-supplied array, this returns a pointer. Presumably it
///         points at four doubles - confirm. The lifetime of the pointed-to
///         storage is not visible here; verify before retaining it.
00413 const double* GetBBox();
00414
/// Writes the line's quadrilateral into a caller-supplied buffer.
/// @param out_quad Decays to double*; the caller must supply at least eight
///                 elements (presumably four x/y corner pairs - confirm).
00419 void GetQuad(double out_quad[8]);
00420
/// @return A Word by value; by its name, the first word of the line.
00425 Word GetFirstWord();
00426
/// @param word_idx Index of the word; presumably in
///                 [0, GetNumWords()) - confirm.
/// @return The Word at that index, by value.
00430 Word GetWord(int word_idx);
00431
/// @return Another Line by value; by its name, the line following this one.
///         Presumably IsValid() is false on the result when there is no
///         next line - confirm.
00435 Line GetNextLine();
00436
/// @return An int; by its name, this line's number. Numbering base and
///         scope are not visible here.
00440 int GetCurrentNum();
00441
/// @return The Style for the line, by value.
00445 Style GetStyle();
00446
/// @return An int identifier; by its name, it identifies the paragraph the
///         line belongs to. Uniqueness scope is not visible here.
00452 int GetParagraphID();
00453
/// @return An int identifier; by its name, it identifies the text flow the
///         line belongs to. Uniqueness scope is not visible here.
00459 int GetFlowID();
00460
/// @return A bool; by its name, whether the line ends with a hyphen.
00465 bool EndsWithHyphen();
00466
/// @return A bool; presumably false for a default-constructed or
///         past-the-end Line - confirm.
00470 bool IsValid();
00471
// NOTE(review): neither comparison operator is declared const, so they cannot
// be called on a const Line left-hand operand.
00472 bool operator== (const Line&);
00473 bool operator!= (const Line&);
/// Default constructor; the resulting state of mp_line is not visible here.
00474 Line();
00475
/// Converting constructor from the C-API type TRN_TextExtractorLine. Not
/// marked explicit, so implicit conversion is allowed.
00477 Line(TRN_TextExtractorLine impl);
// Underlying C-API line value. Declared in the public section.
00478 TRN_TextExtractorLine mp_line;
00480 };
00481
/// @return The number of lines as an int; presumably for the page given to
///         Begin() - confirm.
00485 int GetNumLines();
00486
00487
/// @return A Line by value; by its name, the first line of the extracted
///         text. Check IsValid() on the result before use (presumably false
///         when there are no lines - confirm).
00494 Line GetFirstLine();
00495
00496
00497 private:
// Underlying C-API extractor handle; the only data member of TextExtractor.
00498 TRN_TextExtractor mp_extractor;
00499 };
00500
00501
00502
// This include sits before the two namespace-closing braces below, so the
// contents of <Impl/TextExtractor.inl> are compiled inside pdftron::PDF.
// Presumably that file holds the inline definitions of the TextExtractor
// members declared in this header - confirm.
00503 #include <Impl/TextExtractor.inl>
00504
// Closes namespace PDF (the inner namespace). The ';' after the brace is an
// empty declaration and is not required.
00505 };
// Closes namespace pdftron (the outer namespace).
00506 };
00507
00508 #endif