public DocumentWrap crawl(String uriroot, File file) throws CrawlException { DocumentWrap document = new DocumentWrap(); try { openFile(file); ContentHandler textHandler = new BodyContentHandler(); Metadata metadata = new Metadata(); ParseContext parseContext = new ParseContext(); Parser parser = new PDFParser(); parser.parse(getFileStream(), textHandler, metadata, parseContext); // Setup the document document.setContent(textHandler.toString(), bStoreBody); document.setSize((int) file.length()); document.setType("application/pdf"); document.setAuthor(metadata.get(Metadata.AUTHOR)); document.setName(metadata.get(Metadata.TITLE)); document.setSummary(metadata.get(Metadata.SUBJECT)); document.setAttribute("keywords", metadata.get(Metadata.KEYWORDS)); document.setId(file.getCanonicalPath()); if (uriroot != null) document.setURL(getUrl(uriroot, file)); } catch (FileNotFoundException e) { throw new CrawlException("File not found: " + file, e); } catch (IOException e) { throw new CrawlException("File: " + file, e); } catch (Exception e) { throw new CrawlException("File: " + file, e); } finally { closeFile(); } return document; }
/**
 * Crawls a single office document and builds a {@link DocumentWrap} describing it.
 *
 * <p>The file is opened via {@code openFile} and parsed with an {@code OOXMLParser} when its
 * name ends in "x" (case-insensitively), otherwise with an {@code OfficeParser}. The
 * extracted body text and the author and comments metadata are copied onto the returned
 * document. The document id is the file's canonical path. The file is always released via
 * {@code closeFile}, whether or not parsing succeeds.
 *
 * @param uriroot root passed to {@code getUrl} to build the document URL; when {@code null}
 *     no URL is set on the document
 * @param file the office file to crawl
 * @return the populated document
 * @throws CrawlException if the file cannot be found, read or parsed; the underlying
 *     exception is preserved as the cause
 */
@Override
public DocumentWrap crawl(String uriroot, File file) throws CrawlException {
    DocumentWrap document = new DocumentWrap();
    try {
        openFile(file);

        ContentHandler textHandler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext parseContext = new ParseContext();

        // Pick the parser from the last letter of the file name. Locale.ROOT keeps the
        // case folding independent of the JVM's default locale.
        Parser parser;
        if (file.getName().toLowerCase(java.util.Locale.ROOT).endsWith("x")) {
            parser = new OOXMLParser();
        } else {
            parser = new OfficeParser();
        }
        parser.parse(getFileStream(), textHandler, metadata, parseContext);

        document.setAuthor(metadata.get(Metadata.AUTHOR));
        document.setSummary(metadata.get(Metadata.COMMENTS));
        document.setContent(textHandler.toString(), bStoreBody);
        // File.length() is a long; a plain (int) cast wraps around for files of 2 GiB or
        // more and would record a negative/garbage size, so clamp to the int range instead.
        document.setSize((int) Math.min(file.length(), Integer.MAX_VALUE));
        document.setId(file.getCanonicalPath());
        if (uriroot != null) {
            document.setURL(getUrl(uriroot, file));
        }
    } catch (FileNotFoundException e) {
        throw new CrawlException("File not found: " + file, e);
    } catch (Exception e) {
        // Covers IOException as well as parser failures; all are reported the same way.
        throw new CrawlException("File: " + file, e);
    } finally {
        closeFile();
    }
    return document;
}
public DocumentWrap crawl(String uriroot, File file) throws CrawlException { DocumentWrap document = new DocumentWrap(); try { MP3File mp3file = new MP3File(file); document.setAttribute("bitrate", String.valueOf(mp3file.getBitRate())); if (mp3file.hasID3v1Tag()) { ID3v1 id = mp3file.getID3v1Tag(); document.setAttribute("album", id.getAlbum()); document.setAttribute("artist", id.getArtist()); document.setAttribute("leadartist", id.getLeadArtist()); document.setAttribute("comment", id.getComment()); document.setAttribute("year", id.getYearReleased()); document.setAttribute("trackno", id.getTrackNumberOnAlbum()); document.setName(id.getTitle()); } // Setup the document document.setSize((int) file.length()); document.setType("audio/mp3"); document.setId(file.getCanonicalPath()); if (uriroot != null) document.setURL(getUrl(uriroot, file)); } catch (FileNotFoundException e) { throw new CrawlException("File not found: " + file, e); } catch (IOException e) { throw new CrawlException("File: " + file, e); } catch (Exception e) { throw new CrawlException("File: " + file, e); } return document; }