コード例 #1
0
  /**
   * Builds a {@code DocumentWrap} for {@code file} by running it through a {@code PDFParser}.
   *
   * <p>The extracted body text, the file size, the fixed type {@code "application/pdf"}, and the
   * author / title / subject / keywords metadata entries are copied onto the document. The
   * document id is the file's canonical path.
   *
   * @param uriroot when non-null, the document URL is set to {@code getUrl(uriroot, file)};
   *     when null, no URL is set
   * @param file the file to parse
   * @return the populated document
   * @throws CrawlException wrapping any exception raised while opening, parsing, or describing
   *     the file; the message distinguishes a missing file from every other failure
   */
  public DocumentWrap crawl(String uriroot, File file) throws CrawlException {
    DocumentWrap document = new DocumentWrap();

    try {
      // openFile / getFileStream / closeFile are defined outside this method; presumably they
      // manage the input stream for `file` -- confirm against the enclosing class.
      openFile(file);

      // NOTE(review): if this is Apache Tika's BodyContentHandler, the no-arg constructor caps
      // the captured text and parse() fails once the cap is exceeded -- confirm this is intended.
      ContentHandler textHandler = new BodyContentHandler();
      Metadata metadata = new Metadata();
      ParseContext parseContext = new ParseContext();

      Parser parser = new PDFParser();
      parser.parse(getFileStream(), textHandler, metadata, parseContext);

      // Body text and basic file facts.
      document.setContent(textHandler.toString(), bStoreBody);
      // File.length() is a long; a bare (int) cast wraps around for files larger than
      // Integer.MAX_VALUE bytes, so clamp rather than report a garbage (possibly negative) size.
      document.setSize((int) Math.min(file.length(), Integer.MAX_VALUE));
      document.setType("application/pdf");

      // Descriptive metadata reported by the parser.
      document.setAuthor(metadata.get(Metadata.AUTHOR));
      document.setName(metadata.get(Metadata.TITLE));
      document.setSummary(metadata.get(Metadata.SUBJECT));
      document.setAttribute("keywords", metadata.get(Metadata.KEYWORDS));

      document.setId(file.getCanonicalPath());

      if (uriroot != null) {
        document.setURL(getUrl(uriroot, file));
      }

    } catch (FileNotFoundException e) {
      throw new CrawlException("File not found: " + file, e);
    } catch (Exception e) {
      // Covers IOException as well: both were wrapped with the identical message.
      throw new CrawlException("File: " + file, e);
    } finally {
      closeFile();
    }

    return document;
  }
コード例 #2
0
  /**
   * Builds a {@code DocumentWrap} for {@code file}, choosing the parser from the file name:
   * names ending in "x" (case-insensitive) go to {@code OOXMLParser}, everything else to
   * {@code OfficeParser}.
   *
   * <p>The author and comments metadata entries, the extracted body text, and the file size are
   * copied onto the document. The document id is the file's canonical path. No type or name is
   * set by this method.
   *
   * @param uriroot when non-null, the document URL is set to {@code getUrl(uriroot, file)};
   *     when null, no URL is set
   * @param file the file to parse
   * @return the populated document
   * @throws CrawlException wrapping any exception raised while opening, parsing, or describing
   *     the file; the message distinguishes a missing file from every other failure
   */
  @Override
  public DocumentWrap crawl(String uriroot, File file) throws CrawlException {
    DocumentWrap document = new DocumentWrap();

    try {
      // openFile / getFileStream / closeFile are defined outside this method; presumably they
      // manage the input stream for `file` -- confirm against the enclosing class.
      openFile(file);

      ContentHandler textHandler = new BodyContentHandler();
      Metadata metadata = new Metadata();
      ParseContext parseContext = new ParseContext();

      // Locale.ROOT keeps the extension check independent of the JVM's default locale.
      boolean endsWithX = file.getName().toLowerCase(java.util.Locale.ROOT).endsWith("x");
      Parser parser;
      if (endsWithX) {
        parser = new OOXMLParser();
      } else {
        parser = new OfficeParser();
      }

      parser.parse(getFileStream(), textHandler, metadata, parseContext);

      document.setAuthor(metadata.get(Metadata.AUTHOR));
      document.setSummary(metadata.get(Metadata.COMMENTS));
      document.setContent(textHandler.toString(), bStoreBody);
      // File.length() is a long; a bare (int) cast wraps around for files larger than
      // Integer.MAX_VALUE bytes, so clamp rather than report a garbage (possibly negative) size.
      document.setSize((int) Math.min(file.length(), Integer.MAX_VALUE));

      document.setId(file.getCanonicalPath());

      if (uriroot != null) {
        document.setURL(getUrl(uriroot, file));
      }

    } catch (FileNotFoundException e) {
      throw new CrawlException("File not found: " + file, e);
    } catch (Exception e) {
      // Covers IOException as well: both were wrapped with the identical message.
      throw new CrawlException("File: " + file, e);
    } finally {
      closeFile();
    }

    return document;
  }
コード例 #3
0
  /**
   * Builds a {@code DocumentWrap} for an MP3 file.
   *
   * <p>The bit rate is always recorded as the {@code "bitrate"} attribute. When the file has an
   * ID3v1 tag, its album, artist, lead artist, comment, year and track number are recorded as
   * attributes and its title becomes the document name; without that tag none of these are set.
   * The size, the fixed type {@code "audio/mp3"}, and the canonical path (as the id) are always
   * set.
   *
   * @param uriroot when non-null, the document URL is set to {@code getUrl(uriroot, file)};
   *     when null, no URL is set
   * @param file the MP3 file to read
   * @return the populated document
   * @throws CrawlException wrapping any exception raised while reading or describing the file;
   *     the message distinguishes a missing file from every other failure
   */
  public DocumentWrap crawl(String uriroot, File file) throws CrawlException {
    DocumentWrap document = new DocumentWrap();

    try {
      MP3File mp3file = new MP3File(file);

      document.setAttribute("bitrate", String.valueOf(mp3file.getBitRate()));

      if (mp3file.hasID3v1Tag()) {
        ID3v1 tag = mp3file.getID3v1Tag();

        document.setAttribute("album", tag.getAlbum());
        document.setAttribute("artist", tag.getArtist());
        document.setAttribute("leadartist", tag.getLeadArtist());
        document.setAttribute("comment", tag.getComment());
        document.setAttribute("year", tag.getYearReleased());
        document.setAttribute("trackno", tag.getTrackNumberOnAlbum());
        document.setName(tag.getTitle());
      }

      // Basic file facts.
      // File.length() is a long; a bare (int) cast wraps around for files larger than
      // Integer.MAX_VALUE bytes, so clamp rather than report a garbage (possibly negative) size.
      document.setSize((int) Math.min(file.length(), Integer.MAX_VALUE));
      document.setType("audio/mp3");
      document.setId(file.getCanonicalPath());

      if (uriroot != null) {
        document.setURL(getUrl(uriroot, file));
      }

    } catch (FileNotFoundException e) {
      throw new CrawlException("File not found: " + file, e);
    } catch (Exception e) {
      // Covers IOException as well: both were wrapped with the identical message.
      throw new CrawlException("File: " + file, e);
    }

    return document;
  }