/**
 * Builds a {@link WebDocument} from a fetched page by extracting its text with Tika.
 *
 * <p>The page's content type, URL and content encoding are handed to Tika as metadata hints
 * before parsing. The resulting document is constructed with {@code 0} as its first argument,
 * the title Tika reported in the metadata, the extracted text and the page URL.
 *
 * @param page the page whose content bytes, content type, content encoding and URL are read
 * @return the extracted document, or {@code null} when parsing fails with an {@link IOException}
 *     or {@code TikaException} (the stack trace is printed in that case)
 */
public static WebDocument getDocument(Page page) {
    InputStream stream = new ByteArrayInputStream(page.getContentBytes());
    try {
      // The URL is used for two metadata hints and for the document itself; render it once.
      String url = page.getURL().toString();
      String contentType = page.getContentType();

      Metadata metadata = new Metadata();
      metadata.set(Metadata.CONTENT_TYPE, contentType);
      metadata.set(Metadata.CONTENT_LOCATION, url);
      metadata.set(Metadata.LOCATION, url);
      metadata.set(Metadata.MIME_TYPE_MAGIC, contentType);
      metadata.set(Metadata.CONTENT_ENCODING, page.getContentEncoding());
      metadata.set(Metadata.TIKA_MIME_FILE, contentType);

      String text = getTika().parseToString(stream, metadata);

      // No explicit close here: the finally block below runs on this path too.
      return new WebDocument(0, metadata.get(Metadata.TITLE), text, url);
    } catch (IOException e) {
      // Best effort: report the failure and fall through to the null return.
      e.printStackTrace();
    } catch (TikaException e) {
      // Best effort: report the failure and fall through to the null return.
      e.printStackTrace();
    } finally {
      try {
        stream.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }

    return null;
  }
Пример #2
0
  /**
   * Reads date and GPS metadata from an image file with Tika, then reverse-geocodes the image and
   * runs OCR on it.
   *
   * <p>The date is taken from the first of these metadata properties that yields a value:
   * {@code Metadata.ORIGINAL_DATE}, {@code TikaCoreProperties.CREATED}, {@code DublinCore.CREATED},
   * {@code TikaCoreProperties.METADATA_DATE}, {@code DublinCore.MODIFIED}. When none is present
   * the current date and time is stored under {@code Metadata.DATE} and used instead.
   *
   * <p>{@link IOException}, {@code TikaException}, {@code SAXException} and
   * {@link InterruptedException} are reported by printing their message; {@code IM4JavaException}
   * prints its stack trace. None of them is rethrown.
   *
   * @param image the image record that receives the date, longitude and latitude
   * @param file the image file that is parsed and passed on to OCR
   * @throws Exception if a step fails with an exception type that is not handled here
   */
  private void parseImage(Image image, File file) throws Exception {
    try {
      Metadata metadata = new Metadata();

      // Parser.parse() leaves closing the stream to the caller, so release the file handle as
      // soon as the metadata has been collected.
      FileInputStream inputStream = new FileInputStream(file);
      try {
        BodyContentHandler handler = new BodyContentHandler();
        ParseContext parseContext = new ParseContext();
        AutoDetectParser parser = new AutoDetectParser();
        parser.parse(inputStream, handler, metadata, parseContext);
      } finally {
        try {
          inputStream.close();
        } catch (IOException closeFailure) {
          // A failed close must not discard metadata that was already read.
          System.out.println(closeFailure.getMessage());
        }
      }

      // Pick the first date property that is present, in order of preference.
      Date taken = metadata.getDate(Metadata.ORIGINAL_DATE);
      if (taken == null) {
        taken = metadata.getDate(TikaCoreProperties.CREATED);
      }
      if (taken == null) {
        taken = metadata.getDate(DublinCore.CREATED);
      }
      if (taken == null) {
        taken = metadata.getDate(TikaCoreProperties.METADATA_DATE);
      }
      if (taken == null) {
        taken = metadata.getDate(DublinCore.MODIFIED);
      }

      String date;
      if (taken != null) {
        date = taken.toString();
      } else {
        // No date in the file: fall back to the current date and time, formatted by Tika.
        metadata.set(Metadata.DATE, new Date());
        date = metadata.get(Metadata.DATE);
      }

      image.setLongitude(metadata.get(Geographic.LONGITUDE));
      image.setLatitude(metadata.get(Geographic.LATITUDE));
      ImageOperations.setMetadataParsingFinished();

      image.setDate(date);
      // NOTE(review): these two calls write back the values just read and look redundant; kept
      // because the Image accessors are not visible here - confirm and remove.
      image.setLongitude(image.getLongitude());
      image.setLatitude(image.getLatitude());
      aPII.reverseGeocode(image);
      ImageOperations.setReverseGeocodeFinished();

      ImageOperations iO = new ImageOperations();
      iO.doOCR(image, file);
      ImageOperations.setOcrFinished();

    } catch (IOException e) {
      System.out.println(e.getMessage());
    } catch (TikaException te) {
      System.out.println(te.getMessage());
    } catch (SAXException se) {
      System.out.println(se.getMessage());
    } catch (InterruptedException ie) {
      // Restore the interrupt flag so code further up the stack can still observe it.
      Thread.currentThread().interrupt();
      System.out.println(ie.getMessage());
    } catch (IM4JavaException je) {
      je.printStackTrace();
    }
  }
  /**
   * {@inheritDoc}
   *
   * <p>Reads the {@code "path"} property of the event, loads the content stored at that path and
   * turns it into at most one Solr document: every content property that {@code index(...)} maps
   * to a field name is added under that name, the text Tika extracts from the content stream (if
   * there is one) is added as {@code "content"}, and the content object itself is attached under
   * {@code _DOC_SOURCE_OBJECT}.
   *
   * <p>Repository and IO failures are logged at warn level and produce an empty result.
   *
   * @see
   *     org.sakaiproject.nakamura.api.solr.IndexingHandler#getDocuments(org.sakaiproject.nakamura.api.solr.RepositorySession,
   *     org.osgi.service.event.Event)
   */
  public Collection<SolrInputDocument> getDocuments(
      RepositorySession repositorySession, Event event) {
    LOGGER.debug("GetDocuments for {} ", event);
    final String contentPath = (String) event.getProperty("path");
    if (ignorePath(contentPath)) {
      return Collections.emptyList();
    }

    final List<SolrInputDocument> results = Lists.newArrayList();
    if (contentPath == null) {
      // Nothing to look up; same empty result and log line as the normal exit below.
      LOGGER.debug("Got documents {} ", results);
      return results;
    }

    try {
      final Session repoSession = repositorySession.adaptTo(Session.class);
      final ContentManager manager = repoSession.getContentManager();
      final Content item = manager.get(contentPath);
      if (item != null) {
        final SolrInputDocument solrDoc = new SolrInputDocument();

        // Copy every property that has an index field name.
        final Map<String, Object> itemProperties = item.getProperties();
        for (Entry<String, Object> property : itemProperties.entrySet()) {
          final String fieldName = index(property);
          if (fieldName == null) {
            continue;
          }
          for (Object value : convertToIndex(property)) {
            solrDoc.addField(fieldName, value);
          }
        }

        // Add the extracted body text when the content has a stream.
        final InputStream body = manager.getInputStream(contentPath);
        if (body != null) {
          try {
            final String bodyText = tika.parseToString(body);
            solrDoc.addField("content", bodyText);
          } catch (TikaException parseFailure) {
            // Extraction failed; the document is still indexed without its body text.
            LOGGER.warn(parseFailure.getMessage(), parseFailure);
          }
        }

        solrDoc.addField(_DOC_SOURCE_OBJECT, item);
        results.add(solrDoc);
      }
    } catch (ClientPoolException failure) {
      LOGGER.warn(failure.getMessage(), failure);
    } catch (StorageClientException failure) {
      LOGGER.warn(failure.getMessage(), failure);
    } catch (AccessDeniedException failure) {
      LOGGER.warn(failure.getMessage(), failure);
    } catch (IOException failure) {
      LOGGER.warn(failure.getMessage(), failure);
    }

    LOGGER.debug("Got documents {} ", results);
    return results;
  }
Пример #4
0
  /**
   * Parses the attachment stream with Tika and copies the author, creation date and keywords it
   * finds into {@code eventResult}.
   *
   * <p>Each value is stored under the key that {@code currentProperties} returns for the matching
   * {@code KpeopleLabel} core-property name, and only when Tika actually reported a value. Parse
   * and IO failures print a stack trace; whatever metadata was collected before the failure is
   * still copied.
   *
   * @param response the item response this call belongs to; not read by this method
   * @param stream the data handler whose input stream is parsed
   */
  private void manageDetails(final GetItemResponse response, final DataHandler stream) {

    InputStream is = null;
    ContentHandler contenthandler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    Parser parser = new AutoDetectParser();
    ParseContext context = new ParseContext();

    try {
      is = stream.getInputStream();
      parser.parse(is, contenthandler, metadata, context);
    } catch (IOException e) {
      e.printStackTrace();
    } catch (SAXException e) {
      e.printStackTrace();
    } catch (TikaException e) {
      e.printStackTrace();
    } finally {
      // Close on every path. The earlier reset() after close() operated on a closed stream
      // and served no purpose, so it is gone.
      if (is != null) {
        try {
          is.close();
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }

    String contentAuthorValue = metadata.get(Metadata.AUTHOR);
    String contentAuthorKey = currentProperties.getProperty(KpeopleLabel.getCorePropertiesAuthor());
    if (contentAuthorValue != null) {
      eventResult.setDetail(contentAuthorKey, contentAuthorValue);
    }

    String contentCreationDateValue = metadata.get(Metadata.CREATION_DATE);
    String contentCreationDateKey =
        currentProperties.getProperty(KpeopleLabel.getCorePropertiesCreationDate());
    if (contentCreationDateValue != null) {
      eventResult.setDetail(contentCreationDateKey, contentCreationDateValue);
    }

    String contentKeywordsValue = metadata.get(Metadata.KEYWORDS);
    String contentKeywordsKey =
        currentProperties.getProperty(KpeopleLabel.getCorePropertiesKeywords());
    if (contentKeywordsValue != null) {
      eventResult.setDetail(contentKeywordsKey, contentKeywordsValue);
    }
  }
Пример #5
0
  /**
   * Parses an MP3 file with Tika's {@code Mp3Parser} and prints all extracted metadata, followed
   * by a short summary (title, artists, composer, genre, album).
   *
   * @param args optional; {@code args[0]} is the path of the MP3 file to read. When no argument is
   *     given, the Windows sample track path is used.
   */
  public static void main(String[] args) {
    String fileLocation =
        (args != null && args.length > 0)
            ? args[0]
            : "C:\\Users\\Public\\Music\\Sample Music\\Kalimba.mp3";

    try {
      Metadata metadata = new Metadata();

      InputStream input = new FileInputStream(new File(fileLocation));
      try {
        ContentHandler handler = new DefaultHandler();
        Parser parser = new Mp3Parser();
        ParseContext parseCtx = new ParseContext();
        parser.parse(input, handler, metadata, parseCtx);
      } finally {
        // Close even when parsing fails so the file handle is not leaked.
        input.close();
      }

      // List all metadata
      String[] metadataNames = metadata.names();

      for (String name : metadataNames) {
        System.out.println(name + ": " + metadata.get(name));
      }

      // Retrieve the necessary info from metadata.
      // The names used below (title, xmpDM:artist, ...) may differ from file to file; check the
      // full listing printed above for the names actually present.
      System.out.println("----------------------------------------------");
      System.out.println("Title: " + metadata.get("title"));
      System.out.println("Artists: " + metadata.get("xmpDM:artist"));
      System.out.println("Composer : " + metadata.get("xmpDM:composer"));
      System.out.println("Genre : " + metadata.get("xmpDM:genre"));
      System.out.println("Album : " + metadata.get("xmpDM:album"));

    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    } catch (SAXException e) {
      e.printStackTrace();
    } catch (TikaException e) {
      e.printStackTrace();
    }
  }
Пример #6
0
  /**
   * Extracts metadata from the product's file with Tika's auto-detecting parser and converts it
   * with {@code transform(...)}.
   *
   * @param product the product whose file (as returned by {@code getProductFile}) is parsed
   * @return the transformed metadata
   * @throws MetExtractionException if the file cannot be found or read, or if Tika or the SAX
   *     handler fails while parsing; the message carries the underlying exception's message
   */
  private Metadata getMetadataFromTika(Product product) throws MetExtractionException {
    try {
      File file = getProductFile(product);
      org.apache.tika.metadata.Metadata tikaMetadata = new org.apache.tika.metadata.Metadata();

      // Parser.parse() leaves closing the stream to the caller; close it on every path so the
      // file handle is not leaked.
      FileInputStream inputStream = new FileInputStream(file);
      try {
        Parser parser = new AutoDetectParser();
        parser.parse(inputStream, new DefaultHandler(), tikaMetadata, new ParseContext());
      } finally {
        try {
          inputStream.close();
        } catch (IOException ignored) {
          // A failed close of a read-only stream must not turn a successful extraction into an
          // error, nor hide the parse failure that is already propagating.
        }
      }
      return transform(tikaMetadata);

    } catch (FileNotFoundException e) {
      throw new MetExtractionException("Unable to find file: Reason: " + e.getMessage());
    } catch (TikaException e) {
      throw new MetExtractionException("Unable to parse the document: Reason: " + e.getMessage());
    } catch (SAXException e) {
      throw new MetExtractionException(
          " Unable to process the SAX events : Reason: " + e.getMessage());
    } catch (IOException e) {
      throw new MetExtractionException(
          "Unable to read the document stream: Reason: " + e.getMessage());
    }
  }
  /**
   * Runs a content stream through a Tika parser and either indexes the result or returns the
   * extracted content in the response.
   *
   * <p>Parser selection: when the request carries {@code ExtractingParams.STREAM_TYPE}, the parser
   * registered for that media type is looked up; otherwise {@code autoDetectParser} is used. If no
   * parser is found a {@code SolrException} with {@code BAD_REQUEST} is thrown.
   *
   * <p>Output: with {@code ExtractingParams.EXTRACT_ONLY} set, the parsed content is serialized
   * (text or XML, per {@code ExtractingParams.EXTRACT_FORMAT}) and added to {@code rsp} under the
   * stream name, with all metadata values added under {@code <stream name>_metadata}. Otherwise
   * the populated {@code SolrContentHandler} is passed to {@code addDoc}. An XPath expression, when
   * given, restricts what reaches the handler via a {@code MatchingContentHandler}.
   *
   * <p>NOTE(review): most settings are read from a {@code params} field that is not declared in
   * this block, while the stream type and resource name come from {@code req.getParams()} -
   * confirm these refer to the same parameters.
   *
   * @param req the request; supplies stream type, resource name, schema and resource loader
   * @param rsp the response that receives extracted content in extract-only mode
   * @param stream the content to parse
   * @param processor not referenced in this method body
   * @throws Exception a {@code SolrException} ({@code SERVER_ERROR}) for SAX failures and for
   *     Tika failures when {@code ignoreTikaException} is false; other exceptions propagate as is
   */
  @Override
  public void load(
      SolrQueryRequest req,
      SolrQueryResponse rsp,
      ContentStream stream,
      UpdateRequestProcessor processor)
      throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
      // Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
      MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
      // The map lookup yields null for an unregistered media type, which leads to the
      // BAD_REQUEST branch at the bottom.
      parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
      parser = autoDetectParser;
    }
    if (parser != null) {
      Metadata metadata = new Metadata();

      // If you specify the resource name (the filename, roughly) with this parameter,
      // then Tika can make use of it in guessing the appropriate MIME type:
      String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
      if (resourceName != null) {
        metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
      }
      // Provide stream's content type as hint for auto detection
      if (stream.getContentType() != null) {
        metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
      }

      InputStream inputStream = null;
      try {
        inputStream = stream.getStream();
        // Record where the content came from alongside whatever Tika extracts.
        metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
        metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
        metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
        metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
        // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
        String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
        if (charset != null) {
          metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
        }

        String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
        boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
        SolrContentHandler handler =
            factory.createSolrContentHandler(metadata, params, req.getSchema());
        // Handler that actually receives the SAX events; replaced below when extracting only
        // and/or when an XPath filter is requested.
        ContentHandler parsingHandler = handler;

        // Both stay null unless extractOnly is true.
        StringWriter writer = null;
        BaseMarkupSerializer serializer = null;
        if (extractOnly == true) {
          // Extract-only: serialize the parsed content into `writer` instead of indexing it.
          String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
          writer = new StringWriter();
          if (extractFormat.equals(TEXT_FORMAT)) {
            serializer = new TextSerializer();
            serializer.setOutputCharStream(writer);
            serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
          } else {
            serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
          }
          if (xpathExpr != null) {
            Matcher matcher = PARSER.parse(xpathExpr);
            serializer
                .startDocument(); // The MatchingContentHandler does not invoke startDocument.  See
                                  // http://tika.markmail.org/message/kknu3hw7argwiqin
            parsingHandler = new MatchingContentHandler(serializer, matcher);
          } else {
            parsingHandler = serializer;
          }
        } else if (xpathExpr != null) {
          // Indexing with an XPath filter: only matching events reach the SolrContentHandler.
          Matcher matcher = PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(handler, matcher);
        } // else leave it as is

        try {
          // potentially use a wrapper handler for parsing, but we still need the SolrContentHandler
          // for getting the document.
          ParseContext context = parseContextConfig.create();

          context.set(Parser.class, parser);
          context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);

          // Password handling
          RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
          String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
          if (pwMapFile != null && pwMapFile.length() > 0) {
            // NOTE(review): `is` is not closed in this method - confirm that epp.parse(is)
            // closes it.
            InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
            if (is != null) {
              log.debug("Password file supplied: " + pwMapFile);
              epp.parse(is);
            }
          }
          context.set(PasswordProvider.class, epp);
          String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
          if (resourcePassword != null) {
            epp.setExplicitPassword(resourcePassword);
            log.debug("Literal password supplied for file " + resourceName);
          }
          parser.parse(inputStream, parsingHandler, metadata, context);
        } catch (TikaException e) {
          // Depending on configuration, a Tika failure is either logged (processing continues
          // with whatever was collected) or surfaced as a server error.
          if (ignoreTikaException)
            log.warn(
                new StringBuilder("skip extracting text due to ")
                    .append(e.getLocalizedMessage())
                    .append(". metadata=")
                    .append(metadata.toString())
                    .toString());
          else throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
        if (extractOnly == false) {
          addDoc(handler);
        } else {
          // serializer is not null, so we need to call endDoc on it if using xpath
          if (xpathExpr != null) {
            serializer.endDocument();
          }
          rsp.add(stream.getName(), writer.toString());
          writer.close();
          // Return every metadata name with all of its values next to the extracted content.
          String[] names = metadata.names();
          NamedList metadataNL = new NamedList();
          for (int i = 0; i < names.length; i++) {
            String[] vals = metadata.getValues(names[i]);
            metadataNL.add(names[i], vals);
          }
          rsp.add(stream.getName() + "_metadata", metadataNL);
        }
      } catch (SAXException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      } finally {
        // The content stream is always released, whether parsing succeeded or not.
        IOUtils.closeQuietly(inputStream);
      }
    } else {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "Stream type of "
              + streamType
              + " didn't match any known parsers.  Please supply the "
              + ExtractingParams.STREAM_TYPE
              + " parameter.");
    }
  }