@Test public void testCommentedJpeg() throws Exception { InputStream stream = Thread.currentThread() .getContextClassLoader() .getResourceAsStream("testJPEG_commented.jpg"); /* * The dates in testJPEG_commented.jpg do not contain timezones. If no timezone is specified, * the Tika input transformer assumes the local time zone. Set the system timezone to UTC * so we can do assertions. */ TimeZone defaultTimeZone = TimeZone.getDefault(); TimeZone.setDefault(TimeZone.getTimeZone("UTC")); Metacard metacard = transform(stream); assertNotNull(metacard); assertThat(metacard.getTitle(), is("Tosteberga \u00C4ngar")); assertNotNull(metacard.getMetadata()); assertThat( metacard.getMetadata(), containsString("<meta name=\"Keywords\" content=\"bird watching\"/>")); assertThat(metacard.getContentTypeName(), is("image/jpeg")); assertThat(convertDate(metacard.getCreatedDate()), is("2010-07-28 11:02:00 UTC")); // Reset timezone back to local time zone. TimeZone.setDefault(defaultTimeZone); }
@Test public void testGeoTaggedJpeg() throws Exception { InputStream stream = Thread.currentThread().getContextClassLoader().getResourceAsStream("testJPEG_GEO.jpg"); /* * The dates in testJPED_GEO.jpg do not contain timezones. If no timezone is specified, * the Tika input transformer assumes the local time zone. Set the system timezone to UTC * so we can do assertions. */ TimeZone defaultTimeZone = TimeZone.getDefault(); TimeZone.setDefault(TimeZone.getTimeZone("UTC")); Metacard metacard = transform(stream); assertNotNull(metacard); assertNotNull(metacard.getMetadata()); assertThat( metacard.getMetadata(), containsString("<meta name=\"Model\" content=\"Canon EOS 40D\"/>")); assertThat(metacard.getContentTypeName(), is("image/jpeg")); assertThat(convertDate(metacard.getCreatedDate()), is("2009-08-11 09:09:45 UTC")); assertThat(convertDate(metacard.getModifiedDate()), is("2009-10-02 23:02:49 UTC")); assertThat( (String) metacard.getAttribute(Metacard.GEOGRAPHY).getValue(), is("POINT(-54.1234 12.54321)")); // Reset timezone back to local time zone. TimeZone.setDefault(defaultTimeZone); }
@Test public void testOpenOffice() throws Exception { InputStream stream = Thread.currentThread().getContextClassLoader().getResourceAsStream("testOpenOffice2.odt"); /* * The dates in testOpenOffice2.odt do not contain timezones. If no timezone is specified, * the Tika input transformer assumes the local time zone. Set the system timezone to UTC * so we can do assertions. */ TimeZone defaultTimeZone = TimeZone.getDefault(); TimeZone.setDefault(TimeZone.getTimeZone("UTC")); Metacard metacard = transform(stream); assertNotNull(metacard); assertThat(metacard.getTitle(), is("Test OpenOffice2 Document")); assertThat(convertDate(metacard.getCreatedDate()), is("2007-09-14 11:06:08 UTC")); assertThat(convertDate(metacard.getModifiedDate()), is("2013-02-13 06:52:10 UTC")); assertNotNull(metacard.getMetadata()); assertThat( metacard.getMetadata(), containsString("This is a sample Open Office document, written in NeoOffice 2.2.1")); assertThat(metacard.getContentTypeName(), is("application/vnd.oasis.opendocument.text")); // Reset timezone back to local time zone. TimeZone.setDefault(defaultTimeZone); }
/**
 * Transforms {@code testGROOVY.groovy} and checks that the metadata contains the source comment
 * and that the content type contains {@code text/plain}.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testGroovySource() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("testGROOVY.groovy");

  final Metacard result = transform(input);

  assertNotNull(result);
  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(metadata, containsString("this is a comment"));
  assertThat(result.getContentTypeName(), containsString("text/plain"));
}
/**
 * Transforms {@code testAIFF.aif} and checks that the metadata mentions {@code PCM_SIGNED} and
 * that the content type is {@code audio/x-aiff}.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testAudioAiff() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("testAIFF.aif");

  final Metacard result = transform(input);

  assertNotNull(result);
  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(metadata, containsString("PCM_SIGNED"));
  assertThat(result.getContentTypeName(), is("audio/x-aiff"));
}
/**
 * Transforms {@code testMID.mid} and checks that the metadata mentions {@code PPQ} and that the
 * content type is {@code audio/midi}.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testAudioMidi() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("testMID.mid");

  final Metacard result = transform(input);

  assertNotNull(result);
  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(metadata, containsString("PPQ"));
  assertThat(result.getContentTypeName(), is("audio/midi"));
}
/**
 * Transforms {@code testCPP.cpp} and checks that the metadata contains the sample's text and that
 * the content type contains {@code text/plain}.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testCppSource() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("testCPP.cpp");

  final Metacard result = transform(input);

  assertNotNull(result);
  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(metadata, containsString("Hello world example"));
  assertThat(result.getContentTypeName(), containsString("text/plain"));
}
/**
 * Transforms {@code testXML.xml} and checks the resulting metacard's title, created date,
 * metadata text and content type.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testXml() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("testXML.xml");

  final Metacard result = transform(input);

  assertNotNull(result);
  assertThat(result.getTitle(), is("Test Document"));
  assertThat(convertDate(result.getCreatedDate()), is("2000-12-01 00:00:00 UTC"));

  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(metadata, containsString("John Smith"));
  assertThat(result.getContentTypeName(), is("application/xml"));
}
/**
 * Transforms {@code testpackage/testJAVA.java} and checks that the metadata contains
 * {@code HelloWorld} and that the content type contains {@code text/plain}.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testJavaSource() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("testpackage/testJAVA.java");

  final Metacard result = transform(input);

  assertNotNull(result);
  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(metadata, containsString("HelloWorld"));
  assertThat(result.getContentTypeName(), containsString("text/plain"));
}
/**
 * Transforms {@code testPNG.png} and checks that the metadata carries the lossless-compression
 * meta tag and that the content type is {@code image/png}.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testPng() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("testPNG.png");

  final Metacard result = transform(input);

  assertNotNull(result);
  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(
      metadata, containsString("<meta name=\"Compression Lossless\" content=\"true\"/>"));
  assertThat(result.getContentTypeName(), is("image/png"));
}
/**
 * Transforms {@code testBMP.bmp} and checks that the metadata carries the {@code BI_RGB}
 * compression meta tag and that the content type is {@code image/x-ms-bmp}.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testBmp() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("testBMP.bmp");

  final Metacard result = transform(input);

  assertNotNull(result);
  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(
      metadata,
      containsString("<meta name=\"Compression CompressionTypeName\" content=\"BI_RGB\"/>"));
  assertThat(result.getContentTypeName(), is("image/x-ms-bmp"));
}
/**
 * Transforms {@code testTIFF.tif} and checks that the metadata carries the bits-per-sample meta
 * tag and that the content type is {@code image/tiff}.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testTiff() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("testTIFF.tif");

  final Metacard result = transform(input);

  assertNotNull(result);
  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(metadata, containsString("<meta name=\"tiff:BitsPerSample\" content=\"8\"/>"));
  assertThat(result.getContentTypeName(), is("image/tiff"));
}
/**
 * Transforms {@code testMP3id3v1_v2.mp3} and checks the resulting metacard's title, the artist
 * meta tag in the metadata and the content type.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testMp3() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("testMP3id3v1_v2.mp3");

  final Metacard result = transform(input);

  assertNotNull(result);
  assertThat(result.getTitle(), is("Test Title"));

  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(
      metadata, containsString("<meta name=\"xmpDM:artist\" content=\"Test Artist\"/>"));
  assertThat(result.getContentTypeName(), is("audio/mpeg"));
}
/**
 * Transforms {@code testPPT.ppt} and checks the resulting metacard's title, created/modified
 * dates, metadata text and content type.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testPpt() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("testPPT.ppt");

  final Metacard result = transform(input);

  assertNotNull(result);
  assertThat(result.getTitle(), is("Sample Powerpoint Slide"));
  assertThat(convertDate(result.getCreatedDate()), is("2007-09-14 17:33:12 UTC"));
  assertThat(convertDate(result.getModifiedDate()), is("2007-09-14 19:16:39 UTC"));

  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(metadata, containsString("Created with Microsoft"));
  assertThat(result.getContentTypeName(), is("application/vnd.ms-powerpoint"));
}
/**
 * Transforms {@code CatalogFrameworkImpl.class} and checks the resulting metacard's title,
 * metadata text and content type.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testJavaClass() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("CatalogFrameworkImpl.class");

  final Metacard result = transform(input);

  assertNotNull(result);
  assertThat(result.getTitle(), is("CatalogFrameworkImpl"));

  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(metadata, containsString("DEFAULT_RESOURCE_NOT_FOUND_MESSAGE"));
  assertThat(result.getContentTypeName(), is("application/java-vm"));
}
/**
 * Transforms {@code testPDF.pdf} and checks the resulting metacard's title, created/modified
 * dates, the page-count meta tag in the metadata and the content type.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testPDF() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("testPDF.pdf");

  final Metacard result = transform(input);

  assertNotNull(result);
  assertThat(result.getTitle(), is("Apache Tika - Apache Tika"));
  assertThat(convertDate(result.getCreatedDate()), is("2007-09-15 09:02:31 UTC"));
  assertThat(convertDate(result.getModifiedDate()), is("2007-09-15 09:02:31 UTC"));

  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(metadata, containsString("<meta name=\"xmpTPg:NPages\" content=\"1\"/>"));
  assertThat(result.getContentTypeName(), is("application/pdf"));
}
/**
 * Transforms {@code testMP4.m4a} and checks the resulting metacard's title, created/modified
 * dates, the artist meta tag in the metadata and the content type.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testMp4() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("testMP4.m4a");

  final Metacard result = transform(input);

  assertNotNull(result);
  assertThat(result.getTitle(), is("Test Title"));
  assertThat(convertDate(result.getCreatedDate()), is("2012-01-28 18:39:18 UTC"));
  assertThat(convertDate(result.getModifiedDate()), is("2012-01-28 18:40:25 UTC"));

  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(
      metadata, containsString("<meta name=\"xmpDM:artist\" content=\"Test Artist\"/>"));
  assertThat(result.getContentTypeName(), is("audio/mp4"));
}
/**
 * Transforms {@code testEXCEL.xls} and checks the resulting metacard's title, created/modified
 * dates, metadata text and content type.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testXls() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("testEXCEL.xls");

  final Metacard result = transform(input);

  assertNotNull(result);
  assertThat(result.getTitle(), is("Simple Excel document"));
  assertThat(convertDate(result.getCreatedDate()), is("2007-10-01 16:13:56 UTC"));
  assertThat(convertDate(result.getModifiedDate()), is("2007-10-01 16:31:43 UTC"));

  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(
      metadata,
      containsString("Written and saved in Microsoft Excel X for Mac Service Release 1."));
  assertThat(result.getContentTypeName(), is("application/vnd.ms-excel"));
}
/**
 * Asserts that the given metacard carries the default title, content type name and content type
 * version, and that its metadata equals {@code sampleMetadata()} once all whitespace is removed
 * from both.
 *
 * @param metacard the metacard to verify
 */
protected void verifyBasics(Metacard metacard) {
  assertEquals(DEFAULT_TITLE, metacard.getTitle());
  assertEquals(DEFAULT_TYPE, metacard.getContentTypeName());
  assertEquals(DEFAULT_VERSION, metacard.getContentTypeVersion());

  // Strip every whitespace character from both sides so formatting differences are ignored.
  final String expectedMetadata = sampleMetadata().replaceAll("\\s", "");
  final String actualMetadata = metacard.getMetadata().replaceAll("\\s", "");
  assertEquals(expectedMetadata, actualMetadata);
}
/**
 * Transforms {@code testWORD.docx} and checks the resulting metacard's title, created/modified
 * dates, metadata text and content type.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testWordDoc() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("testWORD.docx");

  final Metacard result = transform(input);

  assertNotNull(result);
  assertThat(result.getTitle(), is("Sample Word Document"));
  assertThat(convertDate(result.getCreatedDate()), is("2008-12-11 16:04:00 UTC"));
  assertThat(convertDate(result.getModifiedDate()), is("2010-11-12 16:21:00 UTC"));

  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(metadata, containsString("<p>This is a sample Microsoft Word Document.</p>"));
  assertThat(
      result.getContentTypeName(),
      is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
}
/**
 * Transforms {@code testPPT.pptx} and checks the resulting metacard's title, created/modified
 * dates, metadata text and content type.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testPptx() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("testPPT.pptx");

  final Metacard result = transform(input);

  assertNotNull(result);
  assertThat(result.getTitle(), is("Attachment Test"));
  assertThat(convertDate(result.getCreatedDate()), is("2010-05-04 06:43:54 UTC"));
  assertThat(convertDate(result.getModifiedDate()), is("2010-06-29 06:34:35 UTC"));

  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(
      metadata,
      containsString("content as every other file being tested for tika content parsing"));
  assertThat(
      result.getContentTypeName(),
      is("application/vnd.openxmlformats-officedocument.presentationml.presentation"));
}
/**
 * Transforms {@code testEXCEL.xlsx} and checks the resulting metacard's title, created/modified
 * dates, metadata text and content type.
 *
 * @throws Exception if the transform fails
 */
@Test
public void testXlsx() throws Exception {
  final ClassLoader loader = Thread.currentThread().getContextClassLoader();
  final InputStream input = loader.getResourceAsStream("testEXCEL.xlsx");

  final Metacard result = transform(input);

  assertNotNull(result);
  assertThat(result.getTitle(), is("Simple Excel document"));
  assertThat(convertDate(result.getCreatedDate()), is("2007-10-01 16:13:56 UTC"));
  assertThat(convertDate(result.getModifiedDate()), is("2008-12-11 16:02:17 UTC"));

  final String metadata = result.getMetadata();
  assertNotNull(metadata);
  assertThat(metadata, containsString("Sample Excel Worksheet - Numbers and their Squares"));
  assertThat(
      result.getContentTypeName(),
      is("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
}
/**
 * Queries the catalog and prints a table of the matching metacards (id, modified date, title and
 * a text excerpt) to the console.
 *
 * <p>If {@code cqlFilter} is set it is parsed with {@code CQL.toFilter}. Otherwise a "like"
 * filter on {@code Metacard.ANY_TEXT} is built from {@code searchPhrase} (defaulting to
 * {@code "*"}), case-sensitive or not according to {@code caseSensitive}. The page size is only
 * set when {@code numberOfItems} is greater than -1.
 *
 * <p>For each result whose metadata is present, the excerpt shows the first occurrence of the
 * search phrase (with {@code *} removed) highlighted in green, surrounded by some context; when
 * no occurrence is found the excerpt is {@code "N/A"}.
 *
 * @return always {@code null}
 * @throws Exception any exception raised while building the filter, running the query or
 *     rendering the results is propagated to the caller
 */
@Override
protected Object doExecute() throws Exception {
  String formatString =
      "%1$-33s %2$-26s %3$-" + TITLE_MAX_LENGTH + "s %4$-" + EXCERPT_MAX_LENGTH + "s%n";

  CatalogFacade catalogProvider = getCatalog();

  Filter filter = null;
  if (cqlFilter != null) {
    filter = CQL.toFilter(cqlFilter);
  } else {
    if (searchPhrase == null) {
      searchPhrase = "*";
    }
    if (caseSensitive) {
      filter =
          getFilterBuilder()
              .attribute(Metacard.ANY_TEXT)
              .is()
              .like()
              .caseSensitiveText(searchPhrase);
    } else {
      filter = getFilterBuilder().attribute(Metacard.ANY_TEXT).is().like().text(searchPhrase);
    }
  }

  QueryImpl query = new QueryImpl(filter);
  query.setRequestsTotalResultsCount(true);
  if (numberOfItems > -1) {
    query.setPageSize(numberOfItems);
  }

  long start = System.currentTimeMillis();
  SourceResponse response = catalogProvider.query(new QueryRequestImpl(query));
  long end = System.currentTimeMillis();

  int size = 0;
  if (response.getResults() != null) {
    size = response.getResults().size();
  }

  console.println();
  console.printf(
      " %d result(s) out of %s%d%s in %3.3f seconds",
      (size),
      Ansi.ansi().fg(Ansi.Color.CYAN).toString(),
      response.getHits(),
      Ansi.ansi().reset().toString(),
      (end - start) / MILLISECONDS_PER_SECOND);
  console.printf(formatString, "", "", "", "");
  printHeaderMessage(String.format(formatString, ID, DATE, TITLE, EXCERPT));

  // The size computation above already allows for a null result list; honour that here too
  // instead of failing with a NullPointerException when iterating.
  if (response.getResults() == null) {
    return null;
  }

  for (Result result : response.getResults()) {
    Metacard metacard = result.getMetacard();

    String title = (metacard.getTitle() != null ? metacard.getTitle() : "N/A");
    String excerpt = "N/A";
    String modifiedDate = "";

    if (searchPhrase != null) {
      if (metacard.getMetadata() != null) {
        XPathHelper helper = new XPathHelper(metacard.getMetadata());
        String indexedText = helper.getDocument().getDocumentElement().getTextContent();
        // Collapse line breaks so the excerpt stays on a single table row.
        indexedText = indexedText.replaceAll("\\r\\n|\\r|\\n", " ");

        String normalizedSearchPhrase = searchPhrase.replaceAll("\\*", "");

        int index = -1;
        if (caseSensitive) {
          index = indexedText.indexOf(normalizedSearchPhrase);
        } else {
          index = indexedText.toLowerCase().indexOf(normalizedSearchPhrase.toLowerCase());
        }

        if (index != -1) {
          // Never let the context window go negative: a phrase longer than the excerpt budget
          // would otherwise produce substring(begin > end) and throw.
          // NOTE(review): the 8 presumably budgets for the two "..." markers plus padding —
          // confirm.
          int contextLength =
              Math.max(0, (EXCERPT_MAX_LENGTH - normalizedSearchPhrase.length() - 8) / 2);

          // In the case-insensitive branch the offset comes from the lower-cased text, whose
          // length can differ from the original for some characters; clamp so the substring
          // calls below stay within bounds.
          int matchStart = Math.min(index, indexedText.length());
          int matchEnd =
              Math.min(indexedText.length(), matchStart + normalizedSearchPhrase.length());

          excerpt =
              "..." + indexedText.substring(Math.max(matchStart - contextLength, 0), matchStart);
          excerpt = excerpt + Ansi.ansi().fg(Ansi.Color.GREEN).toString();
          excerpt = excerpt + indexedText.substring(matchStart, matchEnd);
          excerpt = excerpt + Ansi.ansi().reset().toString();
          excerpt =
              excerpt
                  + indexedText.substring(
                      matchEnd, Math.min(indexedText.length(), matchEnd + contextLength))
                  + "...";
        }
      }
    }

    if (metacard.getModifiedDate() != null) {
      modifiedDate =
          new DateTime(metacard.getModifiedDate().getTime()).toString(DATETIME_FORMATTER);
    }

    console.printf(
        formatString,
        metacard.getId(),
        modifiedDate,
        title.substring(0, Math.min(title.length(), TITLE_MAX_LENGTH)),
        excerpt);
  }

  return null;
}
/**
 * Creates a Mockito mock of {@link Metacard} whose {@code getMetadata()} is stubbed to return
 * the value of {@code getSample()}.
 *
 * @return the stubbed metacard mock
 */
private Metacard getMockMetacard() {
  final String sampleMetadata = getSample();

  final Metacard stubbed = mock(Metacard.class);
  when(stubbed.getMetadata()).thenReturn(sampleMetadata);
  return stubbed;
}