/**
 * Test model based extraction: the text of every paragraph reachable through the document's
 * {@code Range} must equal the corresponding expected entry in {@code p_text}.
 */
public void testExtractFromModel() {
  final Range range = doc.getRange();
  final int paragraphCount = range.numParagraphs();

  // Phase 1: pull the text of every paragraph out of the model.
  final String[] extracted = new String[paragraphCount];
  for (int idx = 0; idx < paragraphCount; idx++) {
    extracted[idx] = range.getParagraph(idx).text();
  }

  // Phase 2: same number of paragraphs, then an entry-by-entry comparison.
  assertEquals(p_text.length, extracted.length);
  for (int pos = 0; pos < p_text.length; pos++) {
    assertEquals(p_text[pos], extracted[pos]);
  }
}
@Override protected void processParagraphes( HWPFDocument wordDocument, Element flow, Range range, int currentTableLevel) { // TODO mc process paragraphes final ListTables listTables = wordDocument.getListTables(); int currentListInfo = 0; final int paragraphs = range.numParagraphs(); for (int p = 0; p < paragraphs; p++) { Paragraph paragraph = range.getParagraph(p); // 加入图片 CharacterRun cr = paragraph.getCharacterRun(0); this.processImage(flow, cr); // table if (paragraph.isInTable() && paragraph.getTableLevel() != currentTableLevel) { if (paragraph.getTableLevel() < currentTableLevel) throw new IllegalStateException( "Trying to process table cell with higher level (" + paragraph.getTableLevel() + ") than current table level (" + currentTableLevel + ") as inner table part"); Table table = range.getTable(paragraph); processTable(wordDocument, flow, table); p += table.numParagraphs(); p--; continue; } // 换页 if (paragraph.text().equals("\u000c")) { processPageBreak(wordDocument, flow); } if (paragraph.getIlfo() != currentListInfo) { currentListInfo = paragraph.getIlfo(); } // 嵌套段落 if (currentListInfo != 0) { if (listTables != null) { final ListFormatOverride listFormatOverride = listTables.getOverride(paragraph.getIlfo()); String label = getBulletText(listTables, paragraph, listFormatOverride.getLsid()); if ("".equals(label)) { itemSymbol = true; /* Element span = htmlDocumentFacade.getDocument().createElement("span"); span.setAttribute("style", "font-size:12.0pt;line-height:150%;font-family:Wingdings;mso-ascii-font-family:Wingdings;mso-hide:none;mso-ansi-language:EN-US;mso-fareast-language:ZH-CN;font-weight:normal;mso-bidi-font-weight:normal;font-style:normal;mso-bidi-font-style:normal;text-underline:windowtext none;text-decoration:none;background:transparent"); span.setTextContent("Ø"); flow.appendChild(span); */ } processParagraph(wordDocument, flow, currentTableLevel, paragraph, label); } else { logger.log( POILogger.WARN, "Paragraph #" + 
paragraph.getStartOffset() + "-" + paragraph.getEndOffset() + " has reference to list structure #" + currentListInfo + ", but listTables not defined in file"); processParagraph( wordDocument, flow, currentTableLevel, paragraph, AbstractWordUtils.EMPTY); } } else { processParagraph(wordDocument, flow, currentTableLevel, paragraph, AbstractWordUtils.EMPTY); } } }
protected void processParagraph( HWPFDocument hwpfDocument, Element parentElement, int currentTableLevel, Paragraph paragraph, String bulletText) { final Element pElement = htmlDocumentFacade.createParagraph(); parentElement.appendChild(pElement); /*if(itemSymbol) System.out.println(itemSymbol);*/ if (itemSymbol) { Element span = htmlDocumentFacade.getDocument().createElement("span"); htmlDocumentFacade.addStyleClass( span, "itemSymbol", "font-size:12.0pt;line-height:150%;font-family:Wingdings;mso-ascii-font-family:Wingdings;mso-hide:none;mso-ansi-language:EN-US;mso-fareast-language:ZH-CN;font-weight:normal;mso-bidi-font-weight:normal;font-style:normal;mso-bidi-font-style:normal;text-underline:windowtext none;text-decoration:none;background:transparent"); span.setTextContent("Ø"); pElement.appendChild(span); itemSymbol = false; } StringBuilder style = new StringBuilder(); WordToHtmlUtils.addParagraphProperties(paragraph, style); final int charRuns = paragraph.numCharacterRuns(); if (charRuns == 0) { return; } { final String pFontName; final int pFontSize; final CharacterRun characterRun = paragraph.getCharacterRun(0); if ("".equals(paragraph.text().trim())) { pElement.setTextContent(String.valueOf(UNICODECHAR_NO_BREAK_SPACE)); } if (characterRun != null) { Triplet triplet = getCharacterRunTriplet(characterRun); pFontSize = characterRun.getFontSize() / 2; pFontName = triplet.fontName; WordToHtmlUtils.addFontFamily(pFontName, style); WordToHtmlUtils.addFontSize(pFontSize, style); } else { pFontSize = -1; pFontName = WordToHtmlUtils.EMPTY; } blocksProperies.push(new BlockProperies(pFontName, pFontSize)); } try { if (WordToHtmlUtils.isNotEmpty(bulletText)) { if (bulletText.endsWith("\t")) { /* * We don't know how to handle all cases in HTML, but at * least simplest case shall be handled */ final float defaultTab = TWIPS_PER_INCH / 2; float firstLinePosition = paragraph.getIndentFromLeft() + paragraph.getFirstLineIndent() + 20; // char have // some space float nextStop 
= (float) (Math.ceil(firstLinePosition / defaultTab) * defaultTab); final float spanMinWidth = nextStop - firstLinePosition; Element span = htmlDocumentFacade.getDocument().createElement("span"); htmlDocumentFacade.addStyleClass( span, "s", "display: inline-block; text-indent: 0; min-width: " + (spanMinWidth / TWIPS_PER_INCH) + "in;"); pElement.appendChild(span); Text textNode = htmlDocumentFacade.createText( bulletText.substring(0, bulletText.length() - 1) + UNICODECHAR_ZERO_WIDTH_SPACE + UNICODECHAR_NO_BREAK_SPACE); span.appendChild(textNode); } else { Text textNode = htmlDocumentFacade.createText(bulletText.substring(0, bulletText.length() - 1)); pElement.appendChild(textNode); } } processCharacters(hwpfDocument, currentTableLevel, paragraph, pElement); } finally { blocksProperies.pop(); } if (style.length() > 0) htmlDocumentFacade.addStyleClass(pElement, "p", style.toString()); WordToHtmlUtils.compactSpans(pElement); return; }