Example #1
1
 /**
  * Regression test for TIKA-924: parses {@code tableNames.numbers} and checks that the extracted
  * text contains "This is the main table".
  *
  * @throws Exception if reading or parsing the test document fails
  */
 @Test
 public void testParseNumbersTableNames() throws Exception {
   Metadata metadata = new Metadata();
   ContentHandler handler = new BodyContentHandler();
   // try-with-resources releases the test document stream even if parse() throws.
   try (InputStream input =
       IWorkParserTest.class.getResourceAsStream("/test-documents/tableNames.numbers")) {
     iWorkParser.parse(input, handler, metadata, parseContext);
   }
   String content = handler.toString();
   assertContains("This is the main table", content);
 }
Example #2
0
  /**
   * Check we get headers, footers and footnotes from Pages.
   *
   * <p>Parses {@code testPagesHeadersFootersFootnotes.pages} and looks for the body text, the
   * header, both numbered footers and the footnote in the extracted content.
   *
   * @throws Exception if reading or parsing the test document fails
   */
  @Test
  public void testParsePagesHeadersFootersFootnotes() throws Exception {
    String footnote = "Footnote: Do a lot of people really use iWork?!?!";
    String header = "THIS IS SOME HEADER TEXT";
    String footer = "THIS IS SOME FOOTER TEXT\t1";
    String footer2 = "THIS IS SOME FOOTER TEXT\t2";

    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();

    // try-with-resources releases the test document stream even if parse() throws.
    try (InputStream input =
        IWorkParserTest.class.getResourceAsStream(
            "/test-documents/testPagesHeadersFootersFootnotes.pages")) {
      iWorkParser.parse(input, handler, metadata, parseContext);
    }
    String contents = handler.toString();

    // Check regular text
    assertContains("Both Pages 1.x", contents); // P1
    assertContains("understanding the Pages document", contents); // P1
    assertContains("should be page 2", contents); // P2

    // Check for headers, footers and footnotes
    assertContains(header, contents);
    assertContains(footer, contents);
    assertContains(footer2, contents);
    assertContains(footnote, contents);
  }
Example #3
0
 /**
  * Check the given InputStream is not closed by the Parser (TIKA-1117).
  *
  * <p>The stream is read once more after parsing; the test then closes the stream itself.
  *
  * @throws Exception if parsing fails or the stream was already closed by the parser
  */
 @Test
 public void testStreamNotClosed() throws Exception {
   Metadata metadata = new Metadata();
   ContentHandler handler = new BodyContentHandler();
   // The read must happen inside the try block: the stream is only closed by the test on exit.
   try (InputStream input =
       IWorkParserTest.class.getResourceAsStream("/test-documents/testKeynote.key")) {
     iWorkParser.parse(input, handler, metadata, parseContext);
     input.read(); // Will throw an Exception if the stream was already closed.
   }
 }
Example #4
0
  /**
   * Parses {@code testPages.pages} and checks the metadata keys, the metadata values and the
   * extracted text of both pages.
   *
   * @throws Exception if reading or parsing the test document fails
   */
  @Test
  public void testParsePages() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    // try-with-resources releases the test document stream even if parse() throws.
    try (InputStream input =
        IWorkParserTest.class.getResourceAsStream("/test-documents/testPages.pages")) {
      iWorkParser.parse(input, handler, metadata, parseContext);
    }

    // Make sure enough keys came through
    // (Exact numbers will vary based on composites)
    assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 50);
    List<String> metadataKeys = Arrays.asList(metadata.names());
    assertTrue(
        "Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(Metadata.LAST_MODIFIED.getName()));
    assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LANGUAGE));

    // Check the metadata values
    assertEquals("application/vnd.apple.pages", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("2010-05-09T21:34:38+0200", metadata.get(Metadata.CREATION_DATE));
    assertEquals("2010-05-09T23:50:36+0200", metadata.get(Metadata.LAST_MODIFIED));
    assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
    assertEquals("2", metadata.get(Metadata.PAGE_COUNT));

    String content = handler.toString();

    // text on page 1
    assertContains("Sample pages document", content);
    assertContains("Some plain text to parse.", content);
    assertContains("Cell one", content);
    assertContains("Cell two", content);
    assertContains("Cell three", content);
    assertContains("Cell four", content);
    assertContains("Cell five", content);
    assertContains("Cell six", content);
    assertContains("Cell seven", content);
    assertContains("Cell eight", content);
    assertContains("Cell nine", content);
    assertContains("Both Pages 1.x and Keynote 2.x", content); // ...

    // text on page 2
    assertContains("A second page....", content);
    assertContains("Extensible Markup Language", content); // ...
  }
Example #5
0
  /**
   * Regression test for TIKA-923: parses {@code testTables.key} and checks that, once every run
   * of whitespace is collapsed to a single space, the extracted text contains
   * "row 1 row 2 row 3".
   *
   * @throws Exception if reading or parsing the test document fails
   */
  @Test
  public void testKeynoteTables() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    // try-with-resources releases the test document stream even if parse() throws.
    try (InputStream input =
        IWorkParserTest.class.getResourceAsStream("/test-documents/testTables.key")) {
      iWorkParser.parse(input, handler, metadata, parseContext);
    }

    // Normalise whitespace so the assertion does not depend on how cells are separated.
    String content = handler.toString().replaceAll("\\s+", " ");
    assertContains("row 1 row 2 row 3", content);
  }
Example #6
0
  /**
   * Parses {@code testKeynote.key} and checks the metadata keys, the metadata values (content
   * type, slide count, presentation size, creator, title) and the extracted slide and table text.
   *
   * @throws Exception if reading or parsing the test document fails
   */
  @Test
  public void testParseKeynote() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    // try-with-resources releases the test document stream even if parse() throws.
    try (InputStream input =
        IWorkParserTest.class.getResourceAsStream("/test-documents/testKeynote.key")) {
      iWorkParser.parse(input, handler, metadata, parseContext);
    }

    // Make sure enough keys came through
    // (Exact numbers will vary based on composites)
    assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 6);
    List<String> metadataKeys = Arrays.asList(metadata.names());
    assertTrue(
        "Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(Metadata.SLIDE_COUNT.getName()));
    // Disabled check, kept for reference:
    //        assertTrue("Metadata not found in " + metadataKeys,
    // metadataKeys.contains(Office.SLIDE_COUNT.getName()));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(TikaCoreProperties.TITLE.getName()));

    // Check the metadata values
    assertEquals("application/vnd.apple.keynote", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals("3", metadata.get(Metadata.SLIDE_COUNT));
    assertEquals("1024", metadata.get(KeynoteContentHandler.PRESENTATION_WIDTH));
    assertEquals("768", metadata.get(KeynoteContentHandler.PRESENTATION_HEIGHT));
    assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));

    String content = handler.toString();
    assertContains("A sample presentation", content);
    assertContains("For the Apache Tika project", content);
    assertContains("Slide 1", content);
    assertContains("Some random text for the sake of testability.", content);
    assertContains("A nice comment", content);
    assertContains("A nice note", content);

    // test table data
    assertContains("Cell one", content);
    assertContains("Cell two", content);
    assertContains("Cell three", content);
    assertContains("Cell four", content);
    assertContains("Cell 5", content);
    assertContains("Cell six", content);
    assertContains("7", content);
    assertContains("Cell eight", content);
    assertContains("5/5/1985", content);
  }
Example #7
0
 /**
  * Regression test for TIKA-918: parses {@code testNumbersCharts.numbers} and checks that the
  * extracted text contains "Expenditure by Category", "Currency Chart name" and "Chart 2".
  *
  * @throws Exception if reading or parsing the test document fails
  */
 @Test
 public void testNumbersExtractChartNames() throws Exception {
   Metadata metadata = new Metadata();
   ContentHandler handler = new BodyContentHandler();
   // try-with-resources releases the test document stream even if parse() throws.
   try (InputStream input =
       IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbersCharts.numbers")) {
     iWorkParser.parse(input, handler, metadata, parseContext);
   }
   String contents = handler.toString();
   assertContains("Expenditure by Category", contents);
   assertContains("Currency Chart name", contents);
   assertContains("Chart 2", contents);
 }
Example #8
0
  /**
   * Regression test for TIKA-910: parses {@code testBulletPoints.key} and checks that, once every
   * run of whitespace is collapsed to a single space, the extracted text contains
   * "bullet point 1 bullet point 2 bullet point 3".
   *
   * @throws Exception if reading or parsing the test document fails
   */
  @Test
  public void testKeynoteBulletPoints() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    // try-with-resources releases the test document stream even if parse() throws.
    try (InputStream input =
        IWorkParserTest.class.getResourceAsStream("/test-documents/testBulletPoints.key")) {
      iWorkParser.parse(input, handler, metadata, parseContext);
    }

    // Normalise whitespace so the assertion does not depend on how bullets are separated.
    String content = handler.toString().replaceAll("\\s+", " ");
    // Include the extracted text in the failure message so a mismatch can be diagnosed.
    assertTrue(
        "Bullet points not found in " + content,
        content.contains("bullet point 1 bullet point 2 bullet point 3"));
  }
Example #9
0
  /**
   * Parses {@code testNumbers.numbers} and checks the metadata keys, the metadata values (page
   * count, creator, title, comments) and the extracted text.
   *
   * @throws Exception if reading or parsing the test document fails
   */
  @Test
  public void testParseNumbers() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();

    // try-with-resources releases the test document stream even if parse() throws.
    try (InputStream input =
        IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbers.numbers")) {
      iWorkParser.parse(input, handler, metadata, parseContext);
    }

    // Make sure enough keys came through
    // (Exact numbers will vary based on composites)
    assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 8);
    List<String> metadataKeys = Arrays.asList(metadata.names());
    assertTrue(
        "Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(TikaCoreProperties.COMMENTS.getName()));
    assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.TITLE));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(TikaCoreProperties.TITLE.getName()));

    // Check the metadata values
    assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
    assertEquals("Tika User", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("Account checking", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("a comment", metadata.get(TikaCoreProperties.COMMENTS));

    String content = handler.toString();
    assertContains("Category", content);
    assertContains("Home", content);
    assertContains("-226", content);
    assertContains("-137.5", content);
    assertContains("Checking Account: 300545668", content);
    assertContains("4650", content);
    assertContains("Credit Card", content);
    assertContains("Groceries", content);
    assertContains("-210", content);
    assertContains("Food", content);
    assertContains("Try adding your own account transactions to this table.", content);
  }
Example #10
0
  /**
   * Parses {@code tableHeaders.numbers} and checks that the extracted text contains the strings
   * "header1" to "header5" and "row1" to "row3".
   *
   * @throws Exception if reading or parsing the test document fails
   */
  @Test
  public void testParseNumbersTableHeaders() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    // try-with-resources releases the test document stream even if parse() throws.
    try (InputStream input =
        IWorkParserTest.class.getResourceAsStream("/test-documents/tableHeaders.numbers")) {
      iWorkParser.parse(input, handler, metadata, parseContext);
    }

    String content = handler.toString();
    for (int header = 1; header <= 5; header++) {
      assertContains("header" + header, content);
    }
    for (int row = 1; row <= 3; row++) {
      assertContains("row" + row, content);
    }
  }
Example #11
0
  /**
   * Regression test for TIKA-904: parses {@code testPagesLayout.pages} and checks that the
   * extracted text contains each of the expected strings asserted below.
   *
   * @throws Exception if reading or parsing the test document fails
   */
  @Test
  public void testPagesLayoutMode() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();

    // try-with-resources releases the test document stream even if parse() throws.
    try (InputStream input =
        IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesLayout.pages")) {
      iWorkParser.parse(input, handler, metadata, parseContext);
    }

    String content = handler.toString();
    assertContains("text box 1 - here is some text", content);
    assertContains("created in a text box in layout mode", content);
    assertContains("text box 2 - more text!@!$@#", content);
    assertContains("this is text inside of a green box", content);
    assertContains("text inside of a green circle", content);
  }
Example #12
0
  /**
   * We don't currently support password protected Pages files, as we don't know how the encryption
   * works (it's not regular Zip Encryption). See TIKA-903 for details.
   *
   * <p>The test expects empty content and the content type
   * {@code application/x-tika-iworks-protected}.
   *
   * @throws Exception if reading or parsing the test document fails
   */
  @Test
  public void testParsePagesPasswordProtected() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();

    // Document password is "tika", but we can't use that yet...
    // try-with-resources releases the test document stream even if parse() throws.
    try (InputStream input =
        IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesPwdProtected.pages")) {
      iWorkParser.parse(input, handler, metadata, parseContext);
    }

    // Content will be empty
    String content = handler.toString();
    assertEquals("", content);

    // Will have been identified as encrypted
    assertEquals("application/x-tika-iworks-protected", metadata.get(Metadata.CONTENT_TYPE));
  }
Example #13
0
  /**
   * Check we get lower-case alphabetic letters within the footer for AutoPageNumber.
   *
   * <p>Parses {@code testPagesHeadersFootersAlphaLower.pages} and expects the header plus footers
   * ending in "a" and "b" in the extracted text.
   *
   * @throws Exception if reading or parsing the test document fails
   */
  @Test
  public void testParsePagesHeadersAlphaLower() throws Exception {
    String header = "THIS IS SOME HEADER TEXT";
    String footer = "THIS IS SOME FOOTER TEXT\ta";
    String footer2 = "THIS IS SOME FOOTER TEXT\tb";

    ContentHandler handler = new BodyContentHandler();

    // try-with-resources releases the test document stream even if parse() throws.
    try (InputStream input =
        IWorkParserTest.class.getResourceAsStream(
            "/test-documents/testPagesHeadersFootersAlphaLower.pages")) {
      iWorkParser.parse(input, handler, new Metadata(), parseContext);
    }
    String contents = handler.toString();

    // Check for headers and footers
    assertContains(header, contents);
    assertContains(footer, contents);
    assertContains(footer2, contents);
  }
Example #14
0
  /**
   * Check we get annotations (eg comments) from Pages.
   *
   * <p>Parses {@code testPagesComments.pages} and looks for the body text and both comment
   * strings in the extracted content.
   *
   * @throws Exception if reading or parsing the test document fails
   */
  @Test
  public void testParsePagesAnnotations() throws Exception {
    String commentA = "comment about the APXL file";
    String commentB = "comment about UIMA";

    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();

    // try-with-resources releases the test document stream even if parse() throws.
    try (InputStream input =
        IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesComments.pages")) {
      iWorkParser.parse(input, handler, metadata, parseContext);
    }
    String contents = handler.toString();

    // Check regular text
    assertContains("Both Pages 1.x", contents); // P1
    assertContains("understanding the Pages document", contents); // P1
    assertContains("should be page 2", contents); // P2

    // Check for comments
    assertContains(commentA, contents);
    assertContains(commentB, contents);
  }