Information extraction from paper documents. Bayer, T., Bohnacker, U., & Renz, I. In Handbook on Optical Character Recognition and Document Image Analysis, pages 653–677. World Scientific Publishing Company, 1997.
abstract   bibtex   
Extracting information from paper documents opens a variety of innovative applications by supporting people in their daily processing of documents. In this chapter, a system that interprets text on paper documents given the restricted domain of a certain application is presented. The system consists of four components. The Document Image Analysis component transforms the text of the scanned document image into an electronic format represented by a sequence of word hypotheses. Based on this sequence, three components extract the information necessary for automatic processing of documents. First, the information being enclosed in structured text is extracted, such as the sender and recipient of business letters, or title and author of scientific papers. Second, the text body of a message is mapped to a certain pre-defined category. In the final step, this text is analyzed and the information which is relevant for the current application is extracted. It is shown that for a real-world application the paper documents can be completely interpreted, resulting in an automatically generated answering letter. The system is fast, fault tolerant with respect to misspelling or recognition errors, and readily adaptable to new applications.
@incollection{Bayer1997,
 author = {Bayer, Thomas and Bohnacker, Uli and Renz, Ingrid},
 title = {Information extraction from paper documents},
 booktitle = {Handbook on Optical Character Recognition and Document Image Analysis},
 pages = {653--677},
 publisher = {World Scientific Publishing Company},
 year = {1997},
 keywords = {Document processing,Document understanding,Information extraction,Linguistic analysis,OCR,Text categorization},
 abstract = {Extracting information from paper documents opens a variety of innovative applications by supporting people in their daily processing of documents. In this chapter, a system that interprets text on paper documents given the restricted domain of a certain application is presented. The system consists of four components. The Document Image Analysis component transforms the text of the scanned document image into an electronic format represented by a sequence of word hypotheses. Based on this sequence, three components extract the information necessary for automatic processing of documents. First, the information being enclosed in structured text is extracted, such as the sender and recipient of business letters, or title and author of scientific papers. Second, the text body of a message is mapped to a certain pre-defined category. In the final step, this text is analyzed and the information which is relevant for the current application is extracted. It is shown that for a real-world application the paper documents can be completely interpreted, resulting in an automatically generated answering letter. The system is fast, fault tolerant with respect to misspelling or recognition errors, and readily adaptable to new applications.},
 internal-note = {Editor names were lost in export (the source record held only "[object Object],[object Object]" in a non-standard "editors" field); restore them in an "editor" field from the book's front matter. Entry type changed from inbook to incollection because the chapter has its own authors and title; the book title was moved from the unrecognized "book" field to "booktitle".},
 id = {90b35fb0-0ed8-364a-9648-8469122734d4},
 created = {2011-12-29T19:53:53.000Z},
 file_attached = {false},
 profile_id = {5284e6aa-156c-3ce5-bc0e-b80cf09f3ef6},
 group_id = {066b42c8-f712-3fc3-abb2-225c158d2704},
 last_modified = {2017-03-14T14:36:19.698Z},
 read = {false},
 starred = {false},
 authored = {false},
 confirmed = {true},
 hidden = {false},
 citation_key = {Bayer1997},
 private_publication = {false},
 bibtype = {incollection}
}

Downloads: 0