Пример #1
0
  /**
   * The .xlsb format (an OOXML container holding binary parts) is not currently supported, but
   * such files must not break processing (TIKA-826): the type is still detected, neither
   * {@code OfficeParser} nor {@code OOXMLParser} claims it, and parsing through
   * {@code AutoDetectParser} completes with empty text.
   */
  @Test
  public void testExcelXLSB() throws Exception {
    final String resourcePath = "/test-documents/testEXCEL.xlsb";
    Detector typeDetector = new DefaultDetector();
    AutoDetectParser autoParser = new AutoDetectParser();

    Metadata metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");

    // Step 1: detection reports the binary macro-enabled sheet type
    MediaType detected;
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(resourcePath)) {
      detected = typeDetector.detect(stream, metadata);
      assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", detected.toString());
    }

    // Step 2: OfficeParser does not advertise the type
    boolean claimedByOfficeParser =
        new OfficeParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(false, claimedByOfficeParser);

    // Step 3: OOXMLParser does not advertise the type either
    boolean claimedByOoxmlParser =
        new OOXMLParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(false, claimedByOoxmlParser);

    // Step 4: the auto-detecting parser runs to completion and produces no text
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(resourcePath)) {
      ParseContext parseContext = new ParseContext();
      parseContext.set(Locale.class, Locale.US);
      ContentHandler textCollector = new BodyContentHandler(-1);
      autoParser.parse(stream, textCollector, metadata, parseContext);

      assertEquals("", textCollector.toString());
    }
  }
Пример #2
0
  /**
   * Extracts metadata from an image file, copies it onto {@code image}, and then runs reverse
   * geocoding and OCR for that image.
   *
   * <p>The file is parsed with an {@code AutoDetectParser}. A date string is taken from the first
   * populated date property (original date, created, Dublin Core created, metadata date, Dublin
   * Core modified); when none is present the current time is used. Longitude and latitude are read
   * from the geographic metadata properties. {@code ImageOperations} is notified after each stage
   * (metadata parsing, reverse geocoding, OCR).
   *
   * <p>I/O, Tika, SAX and interruption failures are reported on standard output, and IM4Java
   * failures on standard error; none of them is rethrown.
   *
   * @param image the image record to populate
   * @param file the image file to read
   * @throws Exception declared by the signature; the checked exceptions handled below are not
   *     propagated
   */
  private void parseImage(Image image, File file) throws Exception {
    try {
      BodyContentHandler handler = new BodyContentHandler();
      Metadata metadata = new Metadata();
      ParseContext parseContext = new ParseContext();
      AutoDetectParser parser = new AutoDetectParser();

      // Scope the stream to the parse call so the file handle is always released, even when
      // parsing throws; previously the stream was never closed.
      try (FileInputStream inputStream = new FileInputStream(file)) {
        parser.parse(inputStream, handler, metadata, parseContext);
      }

      // Use the first date property that is populated, in order of preference
      String date;
      if (metadata.getDate(Metadata.ORIGINAL_DATE) != null) {
        date = metadata.getDate(Metadata.ORIGINAL_DATE).toString();
      } else if (metadata.getDate(TikaCoreProperties.CREATED) != null) {
        date = metadata.getDate(TikaCoreProperties.CREATED).toString();
      } else if (metadata.getDate(DublinCore.CREATED) != null) {
        date = metadata.getDate(DublinCore.CREATED).toString();
      } else if (metadata.getDate(TikaCoreProperties.METADATA_DATE) != null) {
        date = metadata.getDate(TikaCoreProperties.METADATA_DATE).toString();
      } else if (metadata.getDate(DublinCore.MODIFIED) != null) {
        date = metadata.getDate(DublinCore.MODIFIED).toString();
      } else {
        // No date in the file: store the current date+time and read it back as a string
        metadata.set(Metadata.DATE, new Date());
        date = metadata.get(Metadata.DATE);
      }
      image.setLongitude(metadata.get(Geographic.LONGITUDE));
      image.setLatitude(metadata.get(Geographic.LATITUDE));
      ImageOperations.setMetadataParsingFinished();

      image.setDate(date);
      // NOTE(review): these two calls write back the values just read and look redundant; kept in
      // case the setters have side effects - confirm against Image and remove if they do not.
      image.setLongitude(image.getLongitude());
      image.setLatitude(image.getLatitude());
      aPII.reverseGeocode(image);
      ImageOperations.setReverseGeocodeFinished();
      ImageOperations iO = new ImageOperations();
      iO.doOCR(image, file);
      ImageOperations.setOcrFinished();

    } catch (IOException | TikaException | SAXException e) {
      System.out.println(e.getMessage());
    } catch (InterruptedException ie) {
      System.out.println(ie.getMessage());
      // Restore the interrupt flag so callers further up the stack can still observe it
      Thread.currentThread().interrupt();
    } catch (IM4JavaException je) {
      je.printStackTrace();
    }
  }
Пример #3
0
 /**
  * Extracts metadata from a stream whose media type has already been determined.
  *
  * <p>Content-based detection is bypassed: the parser is created with a detector that always
  * answers {@code type}. The SAX events produced while parsing go to a {@code DefaultHandler}, so
  * only the metadata is kept.
  *
  * @param type media type of the data
  * @param is data input stream
  * @return the metadata populated while parsing {@code is}
  * @throws RuntimeException wrapping any exception raised during parsing
  */
 public static Metadata detect(final MediaType type, InputStream is) {
   final Metadata extracted = new Metadata();
   final AutoDetectParser fixedTypeParser = new AutoDetectParser((stream, hints) -> type);
   try {
     fixedTypeParser.parse(is, new DefaultHandler(), extracted);
     return extracted;
   } catch (Exception failure) {
     throw new RuntimeException(failure);
   }
 }
Пример #4
0
  /**
   * Enriches a JSON document with the text, metadata and detected language of its raw content.
   *
   * <p>{@code contents} is deserialized as a JSON object. When it holds a {@code _source} object
   * with a {@code raw_content} entry, that content is parsed with an {@code AutoDetectParser} and
   * three entries are accumulated onto {@code _source}:
   *
   * <ul>
   *   <li>{@code tikametadata} - every metadata name mapped to an array of its values
   *   <li>{@code raw_text} - the extracted text with blank lines removed
   *   <li>{@code rawtextdetectedlanguage} - the language reported for that text
   * </ul>
   *
   * <p>Any exception raised during extraction is logged and the object is returned in whatever
   * state it had reached.
   *
   * @param contents JSON text of the document
   * @return the parsed JSON object, enriched when extraction succeeded
   */
  private JSONObject extractTika(String contents) {

    JSONObject jObj = (JSONObject) JSONSerializer.toJSON(contents);

    if (jObj.containsKey("_source")) {
      JSONObject jObjSource = jObj.getJSONObject("_source");

      if (jObjSource.containsKey("raw_content")) {
        String rawHtml = jObjSource.getString("raw_content");

        // Encode explicitly as UTF-8: the no-argument getBytes() uses the platform default
        // charset, which makes the bytes handed to the parser depend on the host configuration.
        ByteArrayInputStream bIs =
            new ByteArrayInputStream(rawHtml.getBytes(java.nio.charset.StandardCharsets.UTF_8));

        Metadata metadata = new Metadata();

        AutoDetectParser adp = new AutoDetectParser();

        // Handler constructed with a write limit of 10 * 1024 * 1024
        ContentHandler handler = new BodyContentHandler(10 * 1024 * 1024);

        try {
          adp.parse(bIs, handler, metadata);

          // Copy every metadata entry into a JSON object of name -> array of values
          JSONObject jObjMetadata = new JSONObject();
          for (String metadataName : metadata.names()) {
            JSONArray jArray = new JSONArray();
            for (String mValue : metadata.getValues(metadataName)) {
              jArray.add(mValue);
            }
            jObjMetadata.accumulate(metadataName, jArray);
          }

          // remove empty lines from the text
          String rawTextAdjusted = handler.toString().replaceAll("(?m)^[ \t]*\r?\n", "");

          // detect language
          LanguageIdentifier li = new LanguageIdentifier(rawTextAdjusted);

          jObjSource.accumulate("tikametadata", jObjMetadata);
          jObjSource.accumulate("raw_text", rawTextAdjusted);
          jObjSource.accumulate("rawtextdetectedlanguage", li.getLanguage());

        } catch (Exception e) {
          // Best effort: a document that cannot be parsed is returned without the extra entries
          LOG.error("Error:", e);
        }
      }
    }
    return jObj;
  }
/**
 * Runs Tika text extraction over a fixed subset of parsers inside a privileged block.
 *
 * <p>do NOT make public
 */
final class TikaImpl {

  /** subset of parsers for the document types we support */
  private static final Parser[] PARSERS = {
    // documents
    new org.apache.tika.parser.html.HtmlParser(),
    new org.apache.tika.parser.rtf.RTFParser(),
    new org.apache.tika.parser.pdf.PDFParser(),
    new org.apache.tika.parser.txt.TXTParser(),
    new org.apache.tika.parser.microsoft.OfficeParser(),
    new org.apache.tika.parser.microsoft.OldExcelParser(),
    new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(),
    new org.apache.tika.parser.odf.OpenDocumentParser(),
    new org.apache.tika.parser.iwork.IWorkPackageParser(),
    new org.apache.tika.parser.xml.DcXMLParser(),
  };

  /** auto-detecting parser restricted to the subset above */
  private static final AutoDetectParser PARSER_INSTANCE = new AutoDetectParser(PARSERS);

  /** single shared Tika facade built on that parser and its detector */
  private static final Tika TIKA_INSTANCE =
      new Tika(PARSER_INSTANCE.getDetector(), PARSER_INSTANCE);

  /**
   * Extracts text from {@code content} with Tika, rethrowing whatever checked exception the
   * extraction raised.
   *
   * <p>Package private only so that tests can reach it.
   *
   * @param content raw bytes of the document
   * @param metadata metadata object handed to Tika for this document
   * @param limit limit passed through to {@code Tika.parseToString}
   * @return the extracted text
   * @throws TikaException if Tika raised it while parsing
   * @throws IOException if reading the content failed
   */
  static String parse(final byte[] content, final Metadata metadata, final int limit)
      throws TikaException, IOException {
    // make sure the caller is not unprivileged code such as a script
    final SecurityManager securityManager = System.getSecurityManager();
    if (securityManager != null) {
      securityManager.checkPermission(new SpecialPermission());
    }

    final PrivilegedExceptionAction<String> extraction =
        () -> TIKA_INSTANCE.parseToString(StreamInput.wrap(content), metadata, limit);

    try {
      return AccessController.doPrivileged(extraction);
    } catch (PrivilegedActionException wrapped) {
      // unwrap the checked exception thrown inside the privileged action
      final Throwable cause = wrapped.getCause();
      if (cause instanceof TikaException) {
        throw (TikaException) cause;
      }
      if (cause instanceof IOException) {
        throw (IOException) cause;
      }
      throw new AssertionError(cause);
    }
  }
}
  /*
   * (non-Javadoc)
   * @see org.alfresco.repo.rendition.executer.AbstractRenderingEngine#render(org.alfresco.repo.rendition.executer.AbstractRenderingEngine.RenderingContext)
   *
   * Rejects sources whose MIME type has no parser registered in the configured Tika parser,
   * then hands the parser and the context over to generateHTML.
   */
  @Override
  protected void render(RenderingContext context) {
    final ContentReader reader = context.makeContentReader();
    final String sourceMimeType = reader.getMimetype();

    // Look the source type up among the parsers registered in the configured Tika instance
    final AutoDetectParser tikaParser = new AutoDetectParser(tikaConfig);
    final boolean supported =
        tikaParser.getParsers().containsKey(MediaType.parse(sourceMimeType));
    if (!supported) {
      throw new RenditionServiceException(
          "Source mime type of " + sourceMimeType + " is not supported by Tika for HTML conversions");
    }

    // Produce the HTML version with Tika; any images found are extracted along the way
    generateHTML(tikaParser, context);
  }
Пример #7
0
  /**
   * Writes every media type in the parser's registry to standard output, each followed by its
   * aliases, its supertype (when it has one) and the class of the parser registered for it (when
   * there is one).
   */
  private void displaySupportedTypes() {
    AutoDetectParser autoParser = new AutoDetectParser();
    MediaTypeRegistry knownTypes = autoParser.getMediaTypeRegistry();
    Map<MediaType, Parser> parserByType = autoParser.getParsers();

    for (MediaType mediaType : knownTypes.getTypes()) {
      System.out.println(mediaType);

      knownTypes.getAliases(mediaType).forEach(alias -> System.out.println("  alias:     " + alias));

      MediaType parent = knownTypes.getSupertype(mediaType);
      if (parent != null) {
        System.out.println("  supertype: " + parent);
      }

      Parser registered = parserByType.get(mediaType);
      if (registered != null) {
        System.out.println("  parser:    " + registered.getClass().getName());
      }
    }
  }
Пример #8
0
  /**
   * Excel 5 and Excel 95 are older formats that only get basic support. Both files must be
   * detected as {@code application/vnd.ms-excel}, a type that {@code OfficeParser} advertises and
   * {@code OOXMLParser} does not, and both must parse through {@code AutoDetectParser} with their
   * sheet names and document properties available.
   */
  @Test
  public void testExcel95() throws Exception {
    final String excel5Path = "/test-documents/testEXCEL_5.xls";
    final String excel95Path = "/test-documents/testEXCEL_95.xls";

    Detector typeDetector = new DefaultDetector();
    AutoDetectParser autoParser = new AutoDetectParser();
    MediaType detected;
    Metadata metadata;

    // Detection: Excel 5
    metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(excel5Path)) {
      detected = typeDetector.detect(stream, metadata);
      assertEquals("application/vnd.ms-excel", detected.toString());
    }

    // Detection: Excel 95
    metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(excel95Path)) {
      detected = typeDetector.detect(stream, metadata);
      assertEquals("application/vnd.ms-excel", detected.toString());
    }

    // The detected type is advertised by OfficeParser ...
    boolean claimedByOfficeParser =
        new OfficeParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(true, claimedByOfficeParser);

    // ... and not by OOXMLParser
    boolean claimedByOoxmlParser =
        new OOXMLParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(false, claimedByOoxmlParser);

    // Parsing: Excel 5
    metadata = new Metadata();
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(excel5Path)) {
      ParseContext parseContext = new ParseContext();
      parseContext.set(Locale.class, Locale.US);
      ContentHandler textCollector = new BodyContentHandler(-1);
      autoParser.parse(stream, textCollector, metadata, parseContext);

      String text = textCollector.toString();

      // Names of the sheets
      assertContains("Feuil1", text);
      assertContains("Feuil3", text);

      // Cell text
      assertContains("Sample Excel", text);
      assertContains("Number", text);

      // Cell numbers
      assertContains("15", text);
      assertContains("225", text);

      // Document properties were extracted as well
      assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
      assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
    }

    // Parsing: Excel 95
    metadata = new Metadata();
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(excel95Path)) {
      ParseContext parseContext = new ParseContext();
      parseContext.set(Locale.class, Locale.US);
      ContentHandler textCollector = new BodyContentHandler(-1);
      autoParser.parse(stream, textCollector, metadata, parseContext);

      String text = textCollector.toString();

      // Name of the only sheet checked; the file holds no actual text or numbers
      assertContains("Foglio1", text);

      // Document properties were extracted as well
      assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
      assertEquals("Marco Quaranta", metadata.get(Office.LAST_AUTHOR));
    }
  }
Пример #9
0
  /**
   * Builds a basic metacard from the content of {@code input} using Tika metadata extraction.
   *
   * <p>The title comes from the Tika title property, falling back to {@code "<No title
   * provided>"} when it is empty. Created and modified dates are set only when the corresponding
   * metadata value is present and parseable; documents without these dates no longer fail. The
   * resource URI is created from {@code uri}, or set to {@code null} when {@code uri} is null.
   *
   * @param input stream to read the document from; must not be null
   * @param uri resource URI to record on the metacard, may be null
   * @return the populated metacard
   * @throws IOException if reading {@code input} fails
   * @throws CatalogTransformerException if {@code input} is null or Tika fails to parse it
   */
  @Override
  public Metacard transform(InputStream input, String uri)
      throws IOException, CatalogTransformerException {
    if (input == null) {
      throw new CatalogTransformerException("Cannot transform null input.");
    }

    MetacardImpl metacard = new MetacardImpl(BasicTypes.BASIC_METACARD);

    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    AutoDetectParser parser = new AutoDetectParser();

    try {
      parser.parse(input, handler, metadata);
    } catch (SAXException | TikaException e) {
      LOGGER.warn(e);
      throw new CatalogTransformerException(e);
    }

    String title = metadata.get(TikaCoreProperties.TITLE);
    if (LOGGER.isDebugEnabled()) {
      LOGGER.debug("Title: " + title);
      LOGGER.debug("Creator: " + metadata.get(TikaCoreProperties.CREATOR));
      LOGGER.debug("Author: " + metadata.get(Metadata.AUTHOR));
      LOGGER.debug("Creation Date: " + metadata.get(TikaCoreProperties.CREATED));
      LOGGER.debug("Modified Date: " + metadata.get(TikaCoreProperties.MODIFIED));
      LOGGER.debug("Content Type: " + metadata.get(Metadata.CONTENT_TYPE));
    }

    if (StringUtils.isEmpty(title)) {
      title = "<No title provided>";
    }
    metacard.setTitle(title);

    // Many documents carry no created/modified date; only set the ones that are available
    Date created = parseMetadataDate(metadata.get(TikaCoreProperties.CREATED));
    if (created != null) {
      metacard.setCreatedDate(created);
    }

    Date modified = parseMetadataDate(metadata.get(TikaCoreProperties.MODIFIED));
    if (modified != null) {
      metacard.setModifiedDate(modified);
    }

    metacard.setResourceURI(uri != null ? URI.create(uri) : null);

    return metacard;
  }

  /** Parses an XML Schema dateTime metadata value, returning null when it is absent or invalid. */
  private Date parseMetadataDate(String value) {
    if (value == null || value.trim().isEmpty()) {
      return null;
    }
    try {
      return javax.xml.bind.DatatypeConverter.parseDateTime(value).getTime();
    } catch (IllegalArgumentException e) {
      // A malformed date should not prevent the rest of the metacard from being produced
      LOGGER.debug("Ignoring unparseable date value: " + value);
      return null;
    }
  }