public DocumentWrap crawl(String uriroot, File file) throws CrawlException { DocumentWrap document = new DocumentWrap(); try { openFile(file); ContentHandler textHandler = new BodyContentHandler(); Metadata metadata = new Metadata(); ParseContext parseContext = new ParseContext(); Parser parser = new PDFParser(); parser.parse(getFileStream(), textHandler, metadata, parseContext); // Setup the document document.setContent(textHandler.toString(), bStoreBody); document.setSize((int) file.length()); document.setType("application/pdf"); document.setAuthor(metadata.get(Metadata.AUTHOR)); document.setName(metadata.get(Metadata.TITLE)); document.setSummary(metadata.get(Metadata.SUBJECT)); document.setAttribute("keywords", metadata.get(Metadata.KEYWORDS)); document.setId(file.getCanonicalPath()); if (uriroot != null) document.setURL(getUrl(uriroot, file)); } catch (FileNotFoundException e) { throw new CrawlException("File not found: " + file, e); } catch (IOException e) { throw new CrawlException("File: " + file, e); } catch (Exception e) { throw new CrawlException("File: " + file, e); } finally { closeFile(); } return document; }
/**
 * Crawls a single office document and builds a {@link DocumentWrap} from
 * its extracted text and metadata.
 *
 * <p>The parser is chosen from the file name: names ending in {@code x}
 * (case-insensitively) are parsed with {@code OOXMLParser}, everything else
 * with {@code OfficeParser}. The file is opened with {@code openFile} and
 * always released with {@code closeFile}, whether or not parsing succeeds.
 *
 * @param uriroot root passed to {@code getUrl} to derive the document URL;
 *                when {@code null} no URL is set on the document
 * @param file    the office file to crawl
 * @return the populated document (author, summary, content, size, id and
 *         optionally URL)
 * @throws CrawlException if the file cannot be found, read or parsed; the
 *                        original exception is preserved as the cause
 */
@Override
public DocumentWrap crawl(String uriroot, File file) throws CrawlException {
    DocumentWrap document = new DocumentWrap();
    try {
        openFile(file);

        ContentHandler textHandler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext parseContext = new ParseContext();

        // A trailing "x" in the name selects the OOXML parser; any other
        // name falls back to the OfficeParser.
        Parser parser;
        if (file.getName().toLowerCase().endsWith("x")) {
            parser = new OOXMLParser();
        } else {
            parser = new OfficeParser();
        }
        parser.parse(getFileStream(), textHandler, metadata, parseContext);

        document.setAuthor(metadata.get(Metadata.AUTHOR));
        document.setSummary(metadata.get(Metadata.COMMENTS));
        // NOTE(review): bStoreBody presumably controls whether the body text
        // is retained by the document - confirm against DocumentWrap.
        document.setContent(textHandler.toString(), bStoreBody);
        // File.length() returns a long; a plain (int) cast wraps around for
        // files larger than Integer.MAX_VALUE bytes and can yield a negative
        // size, so clamp before narrowing.
        document.setSize((int) Math.min(file.length(), (long) Integer.MAX_VALUE));
        document.setId(file.getCanonicalPath());
        if (uriroot != null) {
            document.setURL(getUrl(uriroot, file));
        }
    } catch (FileNotFoundException e) {
        throw new CrawlException("File not found: " + file, e);
    } catch (Exception e) {
        // Also covers IOException, which was previously caught separately
        // with an identical message and cause.
        throw new CrawlException("File: " + file, e);
    } finally {
        closeFile();
    }
    return document;
}