@Beta public class WordToTextConverter extends AbstractWordConverter
UNICODECHAR_NO_BREAK_SPACE, UNICODECHAR_NONBREAKING_HYPHEN, UNICODECHAR_ZERO_WIDTH_SPACE
Constructor and Description |
---|
WordToTextConverter()
Creates a new instance of WordToTextConverter. |
WordToTextConverter(org.w3c.dom.Document document)
Creates a new instance of WordToTextConverter. |
WordToTextConverter(TextDocumentFacade textDocumentFacade) |
Modifier and Type | Method and Description |
---|---|
protected void |
afterProcess()
Special actions that need to be called after processing is complete, like
updating stylesheets or building the document notes list.
|
org.w3c.dom.Document |
getDocument() |
java.lang.String |
getText() |
static java.lang.String |
getText(DirectoryNode root) |
static java.lang.String |
getText(java.io.File docFile) |
static java.lang.String |
getText(HWPFDocumentCore wordDocument) |
boolean |
isOutputSummaryInformation() |
static void |
main(java.lang.String[] args)
Java main() interface to interact with WordToTextConverter. |
protected void |
outputCharacters(org.w3c.dom.Element block,
CharacterRun characterRun,
java.lang.String text) |
protected void |
processBookmarks(HWPFDocumentCore wordDocument,
org.w3c.dom.Element currentBlock,
Range range,
int currentTableLevel,
java.util.List<Bookmark> rangeBookmarks)
Wrap range into bookmark(s) and process it.
|
protected void |
processDocumentInformation(SummaryInformation summaryInformation) |
void |
processDocumentPart(HWPFDocumentCore wordDocument,
Range range) |
protected void |
processDrawnObject(HWPFDocument doc,
CharacterRun characterRun,
OfficeDrawing officeDrawing,
java.lang.String path,
org.w3c.dom.Element block) |
protected void |
processEndnoteAutonumbered(HWPFDocument wordDocument,
int noteIndex,
org.w3c.dom.Element block,
Range endnoteTextRange) |
protected void |
processFootnoteAutonumbered(HWPFDocument wordDocument,
int noteIndex,
org.w3c.dom.Element block,
Range footnoteTextRange) |
protected void |
processHyperlink(HWPFDocumentCore wordDocument,
org.w3c.dom.Element currentBlock,
Range textRange,
int currentTableLevel,
java.lang.String hyperlink) |
protected void |
processImage(org.w3c.dom.Element currentBlock,
boolean inlined,
Picture picture) |
protected void |
processImage(org.w3c.dom.Element currentBlock,
boolean inlined,
Picture picture,
java.lang.String url) |
protected void |
processImageWithoutPicturesManager(org.w3c.dom.Element currentBlock,
boolean inlined,
Picture picture) |
protected void |
processLineBreak(org.w3c.dom.Element block,
CharacterRun characterRun) |
protected boolean |
processOle2(HWPFDocument wordDocument,
org.w3c.dom.Element block,
Entry entry) |
protected void |
processPageBreak(HWPFDocumentCore wordDocument,
org.w3c.dom.Element flow) |
protected void |
processPageref(HWPFDocumentCore wordDocument,
org.w3c.dom.Element currentBlock,
Range textRange,
int currentTableLevel,
java.lang.String pageref) |
protected void |
processParagraph(HWPFDocumentCore wordDocument,
org.w3c.dom.Element parentElement,
int currentTableLevel,
Paragraph paragraph,
java.lang.String bulletText) |
protected void |
processSection(HWPFDocumentCore wordDocument,
Section section,
int s) |
protected void |
processTable(HWPFDocumentCore wordDocument,
org.w3c.dom.Element flow,
Table table) |
void |
setOutputSummaryInformation(boolean outputDocumentInformation) |
getCharacterRunTriplet, getFontReplacer, getNumberColumnsSpanned, getNumberRowsSpanned, getPicturesManager, processCharacters, processDeadField, processDocument, processDrawnObject, processDropDownList, processField, processNoteAnchor, processParagraphes, processSingleSection, processSymbol, setFontReplacer, setPicturesManager, tryDeadField
public WordToTextConverter() throws javax.xml.parsers.ParserConfigurationException
Creates a new instance of WordToTextConverter. Can be used to output several HWPFDocuments into a single text document.
javax.xml.parsers.ParserConfigurationException - if an internal DocumentBuilder cannot be created
public WordToTextConverter(org.w3c.dom.Document document)
Creates a new instance of WordToTextConverter. Can be used to output several HWPFDocuments into a single text document.
document - XML DOM Document used as storage for text pieces
public WordToTextConverter(TextDocumentFacade textDocumentFacade)
public static java.lang.String getText(DirectoryNode root) throws java.lang.Exception
java.lang.Exception
public static java.lang.String getText(java.io.File docFile) throws java.lang.Exception
java.lang.Exception
public static java.lang.String getText(HWPFDocumentCore wordDocument) throws java.lang.Exception
java.lang.Exception
public static void main(java.lang.String[] args) throws java.lang.Exception
Java main() interface to interact with WordToTextConverter.
Usage: WordToTextConverter infile outfile
Where infile is an input .doc file (Word 95-2007) which will be rendered as plain text into outfile.
java.lang.Exception
protected void afterProcess()
AbstractWordConverter
afterProcess
in class AbstractWordConverter
public org.w3c.dom.Document getDocument()
getDocument
in class AbstractWordConverter
public java.lang.String getText() throws java.lang.Exception
java.lang.Exception
public boolean isOutputSummaryInformation()
protected void outputCharacters(org.w3c.dom.Element block, CharacterRun characterRun, java.lang.String text)
outputCharacters
in class AbstractWordConverter
protected void processBookmarks(HWPFDocumentCore wordDocument, org.w3c.dom.Element currentBlock, Range range, int currentTableLevel, java.util.List<Bookmark> rangeBookmarks)
AbstractWordConverter
processBookmarks
in class AbstractWordConverter
protected void processDocumentInformation(SummaryInformation summaryInformation)
processDocumentInformation
in class AbstractWordConverter
public void processDocumentPart(HWPFDocumentCore wordDocument, Range range)
processDocumentPart
in class AbstractWordConverter
protected void processDrawnObject(HWPFDocument doc, CharacterRun characterRun, OfficeDrawing officeDrawing, java.lang.String path, org.w3c.dom.Element block)
processDrawnObject
in class AbstractWordConverter
protected void processEndnoteAutonumbered(HWPFDocument wordDocument, int noteIndex, org.w3c.dom.Element block, Range endnoteTextRange)
processEndnoteAutonumbered
in class AbstractWordConverter
protected void processFootnoteAutonumbered(HWPFDocument wordDocument, int noteIndex, org.w3c.dom.Element block, Range footnoteTextRange)
processFootnoteAutonumbered
in class AbstractWordConverter
protected void processHyperlink(HWPFDocumentCore wordDocument, org.w3c.dom.Element currentBlock, Range textRange, int currentTableLevel, java.lang.String hyperlink)
processHyperlink
in class AbstractWordConverter
protected void processImage(org.w3c.dom.Element currentBlock, boolean inlined, Picture picture)
processImage
in class AbstractWordConverter
protected void processImage(org.w3c.dom.Element currentBlock, boolean inlined, Picture picture, java.lang.String url)
processImage
in class AbstractWordConverter
protected void processImageWithoutPicturesManager(org.w3c.dom.Element currentBlock, boolean inlined, Picture picture)
processImageWithoutPicturesManager
in class AbstractWordConverter
protected void processLineBreak(org.w3c.dom.Element block, CharacterRun characterRun)
processLineBreak
in class AbstractWordConverter
protected boolean processOle2(HWPFDocument wordDocument, org.w3c.dom.Element block, Entry entry) throws java.lang.Exception
processOle2
in class AbstractWordConverter
java.lang.Exception
protected void processPageBreak(HWPFDocumentCore wordDocument, org.w3c.dom.Element flow)
processPageBreak
in class AbstractWordConverter
protected void processPageref(HWPFDocumentCore wordDocument, org.w3c.dom.Element currentBlock, Range textRange, int currentTableLevel, java.lang.String pageref)
processPageref
in class AbstractWordConverter
protected void processParagraph(HWPFDocumentCore wordDocument, org.w3c.dom.Element parentElement, int currentTableLevel, Paragraph paragraph, java.lang.String bulletText)
processParagraph
in class AbstractWordConverter
protected void processSection(HWPFDocumentCore wordDocument, Section section, int s)
processSection
in class AbstractWordConverter
protected void processTable(HWPFDocumentCore wordDocument, org.w3c.dom.Element flow, Table table)
processTable
in class AbstractWordConverter
public void setOutputSummaryInformation(boolean outputDocumentInformation)
Copyright 2021 The Apache Software Foundation or its licensors, as applicable.