public static WebDocument getDocument(Page page) {
    InputStream stream = new ByteArrayInputStream(page.getContentBytes());
    try {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, page.getContentType());
        metadata.set(Metadata.CONTENT_LOCATION, page.getURL().toString());
        metadata.set(Metadata.LOCATION, page.getURL().toString());
        metadata.set(Metadata.MIME_TYPE_MAGIC, page.getContentType());
        metadata.set(Metadata.CONTENT_ENCODING, page.getContentEncoding());
        metadata.set(Metadata.TIKA_MIME_FILE, page.getContentType());

        String text = getTika().parseToString(stream, metadata);
        return new WebDocument(0, metadata.get(Metadata.TITLE), text, page.getURL().toString());
    } catch (IOException e) {
        e.printStackTrace();
    } catch (TikaException e) {
        e.printStackTrace();
    } finally {
        // Close the stream exactly once, whether or not parsing succeeded.
        try {
            stream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return null;
}
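// getDocument() above relies on a getTika() helper that is not shown. A minimal
// sketch of such a helper, assuming a lazily initialised, shared
// org.apache.tika.Tika facade (the field name and the lazy/synchronized
// initialisation are assumptions, not part of the original code):
private static Tika tika;

private static synchronized Tika getTika() {
    if (tika == null) {
        // The Tika facade wraps an AutoDetectParser and exposes
        // parseToString(InputStream, Metadata), which getDocument() calls.
        tika = new Tika();
    }
    return tika;
}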
private void parseImage(Image image, File file) throws Exception {
    FileInputStream inputStream = null;
    try {
        // Detect the file type and extract metadata
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        inputStream = new FileInputStream(file);
        ParseContext parseContext = new ParseContext();

        // Parser
        AutoDetectParser parser = new AutoDetectParser();
        parser.parse(inputStream, handler, metadata, parseContext);

        // Pick the first available date field, falling back to the current date+time
        String date;
        if (metadata.getDate(Metadata.ORIGINAL_DATE) != null) {
            date = metadata.getDate(Metadata.ORIGINAL_DATE).toString();
        } else if (metadata.getDate(TikaCoreProperties.CREATED) != null) {
            date = metadata.getDate(TikaCoreProperties.CREATED).toString();
        } else if (metadata.getDate(DublinCore.CREATED) != null) {
            date = metadata.getDate(DublinCore.CREATED).toString();
        } else if (metadata.getDate(TikaCoreProperties.METADATA_DATE) != null) {
            date = metadata.getDate(TikaCoreProperties.METADATA_DATE).toString();
        } else if (metadata.getDate(DublinCore.MODIFIED) != null) {
            date = metadata.getDate(DublinCore.MODIFIED).toString();
        } else {
            // Current date+time
            metadata.set(Metadata.DATE, new Date());
            date = metadata.get(Metadata.DATE);
        }

        // Image field setting
        image.setLongitude(metadata.get(Geographic.LONGITUDE));
        image.setLatitude(metadata.get(Geographic.LATITUDE));
        ImageOperations.setMetadataParsingFinished();
        image.setDate(date);

        aPII.reverseGeocode(image);
        ImageOperations.setReverseGeocodeFinished();

        ImageOperations iO = new ImageOperations();
        iO.doOCR(image, file);
        ImageOperations.setOcrFinished();
    } catch (IOException e) {
        System.out.println(e.getMessage());
    } catch (TikaException te) {
        System.out.println(te.getMessage());
    } catch (SAXException se) {
        System.out.println(se.getMessage());
    } catch (InterruptedException ie) {
        System.out.println(ie.getMessage());
    } catch (IM4JavaException je) {
        je.printStackTrace();
    } finally {
        if (inputStream != null) {
            inputStream.close();
        }
    }
}
/**
 * {@inheritDoc}
 *
 * @see org.sakaiproject.nakamura.api.solr.IndexingHandler#getDocuments(org.sakaiproject.nakamura.api.solr.RepositorySession,
 *      org.osgi.service.event.Event)
 */
public Collection<SolrInputDocument> getDocuments(RepositorySession repositorySession, Event event) {
    LOGGER.debug("GetDocuments for {} ", event);
    String path = (String) event.getProperty("path");
    if (ignorePath(path)) {
        return Collections.emptyList();
    }
    List<SolrInputDocument> documents = Lists.newArrayList();
    if (path != null) {
        try {
            Session session = repositorySession.adaptTo(Session.class);
            ContentManager contentManager = session.getContentManager();
            Content content = contentManager.get(path);
            if (content != null) {
                SolrInputDocument doc = new SolrInputDocument();
                Map<String, Object> properties = content.getProperties();
                for (Entry<String, Object> p : properties.entrySet()) {
                    String indexName = index(p);
                    if (indexName != null) {
                        for (Object o : convertToIndex(p)) {
                            doc.addField(indexName, o);
                        }
                    }
                }

                InputStream contentStream = contentManager.getInputStream(path);
                if (contentStream != null) {
                    try {
                        String extracted = tika.parseToString(contentStream);
                        doc.addField("content", extracted);
                    } catch (TikaException e) {
                        LOGGER.warn(e.getMessage(), e);
                    }
                }

                doc.addField(_DOC_SOURCE_OBJECT, content);
                documents.add(doc);
            }
        } catch (ClientPoolException e) {
            LOGGER.warn(e.getMessage(), e);
        } catch (StorageClientException e) {
            LOGGER.warn(e.getMessage(), e);
        } catch (AccessDeniedException e) {
            LOGGER.warn(e.getMessage(), e);
        } catch (IOException e) {
            LOGGER.warn(e.getMessage(), e);
        }
    }
    LOGGER.debug("Got documents {} ", documents);
    return documents;
}
private void manageDetails(final GetItemResponse response, final DataHandler stream) {
    InputStream is = null;
    ContentHandler contenthandler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    // metadata.set(Metadata.RESOURCE_NAME_KEY, f.getName());
    Parser parser = new AutoDetectParser();
    ParseContext context = new ParseContext();
    try {
        is = stream.getInputStream();
        parser.parse(is, contenthandler, metadata, context);
        is.close();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (SAXException e) {
        e.printStackTrace();
    } catch (TikaException e) {
        e.printStackTrace();
    }

    String contentAuthorValue = metadata.get(Metadata.AUTHOR);
    String contentAuthorKey = currentProperties.getProperty(KpeopleLabel.getCorePropertiesAuthor());
    if (contentAuthorValue != null) {
        eventResult.setDetail(contentAuthorKey, contentAuthorValue);
    }

    String contentCreationDateValue = metadata.get(Metadata.CREATION_DATE);
    String contentCreationDateKey = currentProperties.getProperty(KpeopleLabel.getCorePropertiesCreationDate());
    if (contentCreationDateValue != null) {
        eventResult.setDetail(contentCreationDateKey, contentCreationDateValue);
    }

    String contentKeywordsValue = metadata.get(Metadata.KEYWORDS);
    String contentKeywordsKey = currentProperties.getProperty(KpeopleLabel.getCorePropertiesKeywords());
    if (contentKeywordsValue != null) {
        eventResult.setDetail(contentKeywordsKey, contentKeywordsValue);
    }

    String[] names = metadata.names();
    /*
     * for (int i = 0; i < names.length; i++) {
     *     System.out.println(names[i]);
     * }
     */
}
/** @param args */
public static void main(String[] args) {
    // String fileLocation = "G:/asas/album/song.mp3";
    String fileLocation = "C:\\Users\\Public\\Music\\Sample Music\\Kalimba.mp3";
    try {
        InputStream input = new FileInputStream(new File(fileLocation));
        ContentHandler handler = new DefaultHandler();
        Metadata metadata = new Metadata();
        Parser parser = new Mp3Parser();
        ParseContext parseCtx = new ParseContext();
        parser.parse(input, handler, metadata, parseCtx);
        input.close();

        // List all metadata
        String[] metadataNames = metadata.names();
        for (String name : metadataNames) {
            System.out.println(name + ": " + metadata.get(name));
        }

        // Retrieve the necessary info from the metadata.
        // Names such as "title" and "xmpDM:artist" may differ based on the
        // parser and the file being processed.
        System.out.println("----------------------------------------------");
        System.out.println("Title: " + metadata.get("title"));
        System.out.println("Artists: " + metadata.get("xmpDM:artist"));
        System.out.println("Composer : " + metadata.get("xmpDM:composer"));
        System.out.println("Genre : " + metadata.get("xmpDM:genre"));
        System.out.println("Album : " + metadata.get("xmpDM:album"));
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (SAXException e) {
        e.printStackTrace();
    } catch (TikaException e) {
        e.printStackTrace();
    }
}
private Metadata getMetadataFromTika(Product product) throws MetExtractionException {
    try {
        File file = getProductFile(product);
        FileInputStream inputStream = new FileInputStream(file);
        org.apache.tika.metadata.Metadata tikaMetadata = new org.apache.tika.metadata.Metadata();
        Parser parser = new AutoDetectParser();
        parser.parse(inputStream, new DefaultHandler(), tikaMetadata, new ParseContext());
        inputStream.close();
        return transform(tikaMetadata);
    } catch (FileNotFoundException e) {
        throw new MetExtractionException("Unable to find file: Reason: " + e.getMessage());
    } catch (TikaException e) {
        throw new MetExtractionException("Unable to parse the document: Reason: " + e.getMessage());
    } catch (SAXException e) {
        throw new MetExtractionException("Unable to process the SAX events: Reason: " + e.getMessage());
    } catch (IOException e) {
        throw new MetExtractionException("Unable to read the document stream: Reason: " + e.getMessage());
    }
}
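// getMetadataFromTika() above delegates to a transform(...) helper that is not
// shown. A hedged sketch of what such a helper might do, assuming the goal is
// simply to copy every Tika metadata entry into a CAS Metadata object; the copy
// loop is an assumption, while Metadata.names(), Metadata.getValues(String) and
// org.apache.oodt.cas.metadata.Metadata.addMetadata(String, String) are existing APIs:
private Metadata transform(org.apache.tika.metadata.Metadata tikaMetadata) {
    Metadata metadata = new Metadata();
    for (String name : tikaMetadata.names()) {
        for (String value : tikaMetadata.getValues(name)) {
            metadata.addMetadata(name, value);
        }
    }
    return metadata;
}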
@Override
public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream,
        UpdateRequestProcessor processor) throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
        // Cache? Parsers are lightweight to construct and thread-safe, so I'm told
        MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
        parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
        parser = autoDetectParser;
    }
    if (parser != null) {
        Metadata metadata = new Metadata();

        // If you specify the resource name (the filename, roughly) with this parameter,
        // then Tika can make use of it in guessing the appropriate MIME type:
        String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
        if (resourceName != null) {
            metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
        }
        // Provide the stream's content type as a hint for auto detection
        if (stream.getContentType() != null) {
            metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
        }

        InputStream inputStream = null;
        try {
            inputStream = stream.getStream();
            metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
            metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
            metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
            metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
            // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
            String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
            if (charset != null) {
                metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
            }

            String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
            boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
            SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema());
            ContentHandler parsingHandler = handler;

            StringWriter writer = null;
            BaseMarkupSerializer serializer = null;
            if (extractOnly) {
                String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
                writer = new StringWriter();
                if (extractFormat.equals(TEXT_FORMAT)) {
                    serializer = new TextSerializer();
                    serializer.setOutputCharStream(writer);
                    serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
                } else {
                    serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
                }
                if (xpathExpr != null) {
                    Matcher matcher = PARSER.parse(xpathExpr);
                    // The MatchingContentHandler does not invoke startDocument. See
                    // http://tika.markmail.org/message/kknu3hw7argwiqin
                    serializer.startDocument();
                    parsingHandler = new MatchingContentHandler(serializer, matcher);
                } else {
                    parsingHandler = serializer;
                }
            } else if (xpathExpr != null) {
                Matcher matcher = PARSER.parse(xpathExpr);
                parsingHandler = new MatchingContentHandler(handler, matcher);
            } // else leave it as is

            try {
                // Potentially use a wrapper handler for parsing, but we still need the
                // SolrContentHandler for getting the document.
                ParseContext context = parseContextConfig.create();
                context.set(Parser.class, parser);
                context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);

                // Password handling
                RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
                String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
                if (pwMapFile != null && pwMapFile.length() > 0) {
                    InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
                    if (is != null) {
                        log.debug("Password file supplied: " + pwMapFile);
                        epp.parse(is);
                    }
                }
                context.set(PasswordProvider.class, epp);
                String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
                if (resourcePassword != null) {
                    epp.setExplicitPassword(resourcePassword);
                    log.debug("Literal password supplied for file " + resourceName);
                }

                parser.parse(inputStream, parsingHandler, metadata, context);
            } catch (TikaException e) {
                if (ignoreTikaException) {
                    log.warn(new StringBuilder("skip extracting text due to ")
                            .append(e.getLocalizedMessage())
                            .append(". metadata=")
                            .append(metadata.toString())
                            .toString());
                } else {
                    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
                }
            }

            if (!extractOnly) {
                addDoc(handler);
            } else {
                // serializer is not null, so we need to call endDocument on it if using xpath
                if (xpathExpr != null) {
                    serializer.endDocument();
                }
                rsp.add(stream.getName(), writer.toString());
                writer.close();
                String[] names = metadata.names();
                NamedList metadataNL = new NamedList();
                for (int i = 0; i < names.length; i++) {
                    String[] vals = metadata.getValues(names[i]);
                    metadataNL.add(names[i], vals);
                }
                rsp.add(stream.getName() + "_metadata", metadataNL);
            }
        } catch (SAXException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    } else {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                "Stream type of " + streamType + " didn't match any known parsers. Please supply the "
                        + ExtractingParams.STREAM_TYPE + " parameter.");
    }
}