/**
   * Checks model-based extraction: the text of every paragraph reachable through the document's
   * overall range must match the expected paragraph texts, in order.
   */
  public void testExtractFromModel() {
    final Range wholeDocument = doc.getRange();
    final int paragraphCount = wholeDocument.numParagraphs();

    // Collect the text of each paragraph as exposed by the object model.
    final String[] extracted = new String[paragraphCount];
    for (int index = 0; index < paragraphCount; index++) {
      extracted[index] = wholeDocument.getParagraph(index).text();
    }

    // Same number of paragraphs first, then a paragraph-by-paragraph comparison.
    assertEquals(p_text.length, extracted.length);
    int position = 0;
    for (String expected : p_text) {
      assertEquals(expected, extracted[position]);
      position++;
    }
  }
Exemplo n.º 2
0
  /**
   * Walks every paragraph of {@code range} and renders it into {@code flow}.
   *
   * <p>For each paragraph, in order:
   *
   * <ol>
   *   <li>its first character run (when it has one) is handed to {@code processImage};
   *   <li>if it belongs to a table whose level differs from {@code currentTableLevel}, the whole
   *       table is rendered through {@code processTable} and the loop skips past the table's
   *       paragraphs;
   *   <li>if its text is exactly a form feed ({@code \u000c}), {@code processPageBreak} is called
   *       (the paragraph itself is still rendered afterwards);
   *   <li>it is rendered through {@code processParagraph}, with a bullet label when it references
   *       a list and the document has list tables, and with an empty label otherwise.
   * </ol>
   *
   * @param wordDocument document being converted; supplies the list tables
   * @param flow element that receives the generated nodes
   * @param range range whose paragraphs are processed
   * @param currentTableLevel table nesting level the caller is currently rendering
   * @throws IllegalStateException if a table paragraph has a table level lower than {@code
   *     currentTableLevel}
   */
  @Override
  protected void processParagraphes(
      HWPFDocument wordDocument, Element flow, Range range, int currentTableLevel) {
    final ListTables listTables = wordDocument.getListTables();

    final int paragraphs = range.numParagraphs();
    for (int p = 0; p < paragraphs; p++) {
      Paragraph paragraph = range.getParagraph(p);

      // Add the picture (if any) carried by the paragraph's first character run. A paragraph
      // may have no character runs at all (processParagraph handles that case explicitly), so
      // guard the lookup instead of asking for run #0 unconditionally.
      if (paragraph.numCharacterRuns() > 0) {
        this.processImage(flow, paragraph.getCharacterRun(0));
      }

      // Table: a paragraph on a different table level starts a nested table.
      if (paragraph.isInTable() && paragraph.getTableLevel() != currentTableLevel) {
        if (paragraph.getTableLevel() < currentTableLevel) {
          throw new IllegalStateException(
              "Trying to process table cell with lower level ("
                  + paragraph.getTableLevel()
                  + ") than current table level ("
                  + currentTableLevel
                  + ") as inner table part");
        }

        Table table = range.getTable(paragraph);
        processTable(wordDocument, flow, table);

        // Skip the paragraphs consumed by the table; the extra decrement compensates for the
        // loop's own p++.
        p += table.numParagraphs();
        p--;
        continue;
      }

      // Page break.
      if (paragraph.text().equals("\u000c")) {
        processPageBreak(wordDocument, flow);
      }

      // List paragraph: a non-zero ilfo means the paragraph references a list structure.
      final int currentListInfo = paragraph.getIlfo();
      if (currentListInfo == 0) {
        processParagraph(wordDocument, flow, currentTableLevel, paragraph, AbstractWordUtils.EMPTY);
        continue;
      }

      if (listTables == null) {
        logger.log(
            POILogger.WARN,
            "Paragraph #"
                + paragraph.getStartOffset()
                + "-"
                + paragraph.getEndOffset()
                + " has reference to list structure #"
                + currentListInfo
                + ", but listTables not defined in file");

        processParagraph(wordDocument, flow, currentTableLevel, paragraph, AbstractWordUtils.EMPTY);
        continue;
      }

      final ListFormatOverride listFormatOverride = listTables.getOverride(currentListInfo);
      String label = getBulletText(listTables, paragraph, listFormatOverride.getLsid());

      if ("".equals(label)) {
        // No textual label: raise the flag that makes processParagraph emit the item symbol
        // span in front of the paragraph content.
        itemSymbol = true;
      }

      processParagraph(wordDocument, flow, currentTableLevel, paragraph, label);
    }
  }
Exemplo n.º 3
0
  /**
   * Renders one Word paragraph as a paragraph element appended to {@code parentElement}.
   *
   * <p>The paragraph element is always created and appended. When the {@code itemSymbol} flag is
   * set, a span containing "Ø" styled with the Wingdings font family is placed first and the flag
   * is cleared. If the paragraph has no character runs nothing else is produced. Otherwise the
   * font of the first character run is pushed as the current block properties, the optional
   * bullet label is emitted, the character runs are processed, and the block properties are
   * popped again. Finally the accumulated paragraph style is attached and adjacent spans are
   * compacted.
   *
   * @param hwpfDocument document being converted
   * @param parentElement element that receives the new paragraph element
   * @param currentTableLevel table nesting level the caller is currently rendering
   * @param paragraph paragraph to render
   * @param bulletText list label to put in front of the paragraph content; empty for none. A
   *     label ending with a tab is rendered inside an inline-block span sized to reach the next
   *     half-inch tab stop; any other label is emitted as plain text.
   */
  protected void processParagraph(
      HWPFDocument hwpfDocument,
      Element parentElement,
      int currentTableLevel,
      Paragraph paragraph,
      String bulletText) {
    final Element pElement = htmlDocumentFacade.createParagraph();
    parentElement.appendChild(pElement);

    // One-shot flag: emit the item symbol once, then reset it for the following paragraphs.
    if (itemSymbol) {
      Element span = htmlDocumentFacade.getDocument().createElement("span");
      htmlDocumentFacade.addStyleClass(
          span,
          "itemSymbol",
          "font-size:12.0pt;line-height:150%;font-family:Wingdings;mso-ascii-font-family:Wingdings;mso-hide:none;mso-ansi-language:EN-US;mso-fareast-language:ZH-CN;font-weight:normal;mso-bidi-font-weight:normal;font-style:normal;mso-bidi-font-style:normal;text-underline:windowtext none;text-decoration:none;background:transparent");
      span.setTextContent("Ø");
      pElement.appendChild(span);
      itemSymbol = false;
    }

    StringBuilder style = new StringBuilder();
    WordToHtmlUtils.addParagraphProperties(paragraph, style);

    if (paragraph.numCharacterRuns() == 0) {
      // Nothing to render; the (possibly symbol-only) paragraph element stays in the tree.
      return;
    }

    final String pFontName;
    final int pFontSize;
    final CharacterRun characterRun = paragraph.getCharacterRun(0);
    if ("".equals(paragraph.text().trim())) {
      // Keep blank paragraphs from collapsing by giving them a no-break space.
      // NOTE(review): DOM setTextContent replaces all existing children, so an item symbol span
      // appended above is removed for blank paragraphs -- confirm this is intended.
      pElement.setTextContent(String.valueOf(UNICODECHAR_NO_BREAK_SPACE));
    }
    if (characterRun != null) {
      Triplet triplet = getCharacterRunTriplet(characterRun);
      // Presumably getFontSize() is in half-points, hence the division -- TODO confirm.
      pFontSize = characterRun.getFontSize() / 2;
      pFontName = triplet.fontName;
      WordToHtmlUtils.addFontFamily(pFontName, style);
      WordToHtmlUtils.addFontSize(pFontSize, style);
    } else {
      pFontSize = -1;
      pFontName = WordToHtmlUtils.EMPTY;
    }
    blocksProperies.push(new BlockProperies(pFontName, pFontSize));

    try {
      if (WordToHtmlUtils.isNotEmpty(bulletText)) {
        if (bulletText.endsWith("\t")) {
          /*
           * We don't know how to handle all cases in HTML, but at
           * least simplest case shall be handled
           */
          final float defaultTab = TWIPS_PER_INCH / 2;
          // The extra 20 leaves some room for the label characters themselves.
          float firstLinePosition =
              paragraph.getIndentFromLeft() + paragraph.getFirstLineIndent() + 20;

          float nextStop = (float) (Math.ceil(firstLinePosition / defaultTab) * defaultTab);

          final float spanMinWidth = nextStop - firstLinePosition;

          Element span = htmlDocumentFacade.getDocument().createElement("span");
          htmlDocumentFacade.addStyleClass(
              span,
              "s",
              "display: inline-block; text-indent: 0; min-width: "
                  + (spanMinWidth / TWIPS_PER_INCH)
                  + "in;");
          pElement.appendChild(span);

          // Drop the trailing tab; the span's min-width stands in for it.
          Text textNode =
              htmlDocumentFacade.createText(
                  bulletText.substring(0, bulletText.length() - 1)
                      + UNICODECHAR_ZERO_WIDTH_SPACE
                      + UNICODECHAR_NO_BREAK_SPACE);
          span.appendChild(textNode);
        } else {
          // No trailing tab to strip here: emit the label unchanged. Cutting off the last
          // character (as the tab branch does) would lose part of the label itself.
          Text textNode = htmlDocumentFacade.createText(bulletText);
          pElement.appendChild(textNode);
        }
      }

      processCharacters(hwpfDocument, currentTableLevel, paragraph, pElement);
    } finally {
      // Always undo the push above, even if character processing throws.
      blocksProperies.pop();
    }

    if (style.length() > 0) {
      htmlDocumentFacade.addStyleClass(pElement, "p", style.toString());
    }

    WordToHtmlUtils.compactSpans(pElement);
  }