Ejemplo n.º 1
0
  /**
   * Parses the sample Pages document with {@code iWorkParser} and verifies the reported metadata
   * keys, selected metadata values, and text expected on page 1 and page 2 of the document.
   *
   * @throws Exception if the test document cannot be read or parsed
   */
  @Test
  public void testParsePages() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();

    // The test opens the stream, so it is also responsible for closing it.
    try (InputStream input =
        IWorkParserTest.class.getResourceAsStream("/test-documents/testPages.pages")) {
      // getResourceAsStream returns null for a missing resource; fail with a clear message
      // instead of handing null to the parser.
      assertTrue("Test document not found: /test-documents/testPages.pages", input != null);
      iWorkParser.parse(input, handler, metadata, parseContext);
    }

    // Make sure enough keys came through
    // (Exact numbers will vary based on composites)
    assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 50);
    List<String> metadataKeys = Arrays.asList(metadata.names());
    assertTrue(
        "Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(Metadata.LAST_MODIFIED.getName()));
    // NOTE(review): presence is checked via Metadata.LANGUAGE, while the value below is read via
    // TikaCoreProperties.LANGUAGE -- confirm both refer to the same metadata key.
    assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LANGUAGE));

    // Check the metadata values
    assertEquals("application/vnd.apple.pages", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("2010-05-09T21:34:38+0200", metadata.get(Metadata.CREATION_DATE));
    assertEquals("2010-05-09T23:50:36+0200", metadata.get(Metadata.LAST_MODIFIED));
    assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
    assertEquals("2", metadata.get(Metadata.PAGE_COUNT));

    String content = handler.toString();

    // text on page 1
    assertContains("Sample pages document", content);
    assertContains("Some plain text to parse.", content);
    assertContains("Cell one", content);
    assertContains("Cell two", content);
    assertContains("Cell three", content);
    assertContains("Cell four", content);
    assertContains("Cell five", content);
    assertContains("Cell six", content);
    assertContains("Cell seven", content);
    assertContains("Cell eight", content);
    assertContains("Cell nine", content);
    assertContains("Both Pages 1.x and Keynote 2.x", content); // ...

    // text on page 2
    assertContains("A second page....", content);
    assertContains("Extensible Markup Language", content); // ...
  }
Ejemplo n.º 2
0
  /**
   * Parses the sample Keynote presentation with {@code iWorkParser} and verifies the reported
   * metadata keys, selected metadata values (including slide count and presentation size), and
   * the extracted slide, comment, note and table text.
   *
   * @throws Exception if the test document cannot be read or parsed
   */
  @Test
  public void testParseKeynote() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();

    // The test opens the stream, so it is also responsible for closing it.
    try (InputStream input =
        IWorkParserTest.class.getResourceAsStream("/test-documents/testKeynote.key")) {
      // getResourceAsStream returns null for a missing resource; fail with a clear message
      // instead of handing null to the parser.
      assertTrue("Test document not found: /test-documents/testKeynote.key", input != null);
      iWorkParser.parse(input, handler, metadata, parseContext);
    }

    // Make sure enough keys came through
    // (Exact numbers will vary based on composites)
    assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 6);
    List<String> metadataKeys = Arrays.asList(metadata.names());
    assertTrue(
        "Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(Metadata.SLIDE_COUNT.getName()));
    // NOTE(review): an equivalent check against Office.SLIDE_COUNT.getName() was previously
    // commented out here -- confirm whether it should replace the Metadata.SLIDE_COUNT check.
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(TikaCoreProperties.TITLE.getName()));

    // Check the metadata values
    assertEquals("application/vnd.apple.keynote", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals("3", metadata.get(Metadata.SLIDE_COUNT));
    assertEquals("1024", metadata.get(KeynoteContentHandler.PRESENTATION_WIDTH));
    assertEquals("768", metadata.get(KeynoteContentHandler.PRESENTATION_HEIGHT));
    assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));

    String content = handler.toString();
    assertContains("A sample presentation", content);
    assertContains("For the Apache Tika project", content);
    assertContains("Slide 1", content);
    assertContains("Some random text for the sake of testability.", content);
    assertContains("A nice comment", content);
    assertContains("A nice note", content);

    // test table data
    assertContains("Cell one", content);
    assertContains("Cell two", content);
    assertContains("Cell three", content);
    assertContains("Cell four", content);
    assertContains("Cell 5", content);
    assertContains("Cell six", content);
    // NOTE(review): presumably a substring match, so any "7" in the extracted text satisfies
    // this check -- consider a more specific expectation.
    assertContains("7", content);
    assertContains("Cell eight", content);
    assertContains("5/5/1985", content);
  }
Ejemplo n.º 3
0
  /**
   * Parses the sample Numbers spreadsheet with {@code iWorkParser} and verifies the reported
   * metadata keys, selected metadata values, and the extracted cell text.
   *
   * @throws Exception if the test document cannot be read or parsed
   */
  @Test
  public void testParseNumbers() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();

    // The test opens the stream, so it is also responsible for closing it.
    try (InputStream input =
        IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbers.numbers")) {
      // getResourceAsStream returns null for a missing resource; fail with a clear message
      // instead of handing null to the parser.
      assertTrue("Test document not found: /test-documents/testNumbers.numbers", input != null);
      iWorkParser.parse(input, handler, metadata, parseContext);
    }

    // Make sure enough keys came through
    // (Exact numbers will vary based on composites)
    assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 8);
    List<String> metadataKeys = Arrays.asList(metadata.names());
    assertTrue(
        "Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(TikaCoreProperties.COMMENTS.getName()));
    // The title is checked under both the Metadata.TITLE key and the TikaCoreProperties.TITLE name.
    assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.TITLE));
    assertTrue(
        "Metadata not found in " + metadataKeys,
        metadataKeys.contains(TikaCoreProperties.TITLE.getName()));

    // Check the metadata values
    assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
    assertEquals("Tika User", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("Account checking", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("a comment", metadata.get(TikaCoreProperties.COMMENTS));

    String content = handler.toString();
    assertContains("Category", content);
    assertContains("Home", content);
    assertContains("-226", content);
    assertContains("-137.5", content);
    assertContains("Checking Account: 300545668", content);
    assertContains("4650", content);
    assertContains("Credit Card", content);
    assertContains("Groceries", content);
    assertContains("-210", content);
    assertContains("Food", content);
    assertContains("Try adding your own account transactions to this table.", content);
  }