/**
 * Reads a legacy Excel (HSSF) workbook from the given stream and stores its
 * summary metadata and text in a new parser result item.
 * <p>
 * When the workbook exposes summary information, its title, author and
 * subject are added as fields. The extracted text is added as the content
 * field after {@code StringUtils.replaceConsecutiveSpaces(text, " ")} has been
 * applied, and language detection is then requested on that field.
 *
 * @param streamLimiter
 *            supplies the input stream holding the workbook
 * @param lang
 *            not read by this implementation
 * @throws IOException
 *             if the workbook cannot be read
 */
@Override
protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException {
	ExcelExtractor extractor = null;
	HSSFWorkbook book = new HSSFWorkbook(streamLimiter.getNewInputStream());
	try {
		extractor = new ExcelExtractor(book);
		ParserResultItem item = getNewParserResultItem();

		// Document-level metadata is optional.
		SummaryInformation summary = extractor.getSummaryInformation();
		if (summary != null) {
			item.addField(ParserFieldEnum.title, summary.getTitle());
			item.addField(ParserFieldEnum.author, summary.getAuthor());
			item.addField(ParserFieldEnum.subject, summary.getSubject());
		}

		String rawText = extractor.getText();
		String normalizedText = StringUtils.replaceConsecutiveSpaces(rawText, " ");
		item.addField(ParserFieldEnum.content, normalizedText);

		// NOTE(review): 10000 is presumably a cap on the amount of text
		// sampled for detection - confirm against langDetection.
		item.langDetection(10000, ParserFieldEnum.content);
	} finally {
		// The extractor may still be null if its construction failed.
		IOUtils.close(extractor);
	}
}
/**
 * Extracts the Disallow rules from a robots.txt stream.
 * <p>
 * The stream is read line by line. Blank lines and lines starting with
 * {@code #} are ignored. Every other line is split into a key and a value at
 * its <em>first</em> colon, so values that themselves contain a colon are
 * kept whole. Lines without a colon or with an empty value are skipped.
 * <ul>
 * <li>{@code User-agent} (case-insensitive) selects the disallow set
 * obtained from {@code getOrCreate} with the lower-cased value.</li>
 * <li>{@code Disallow} (case-insensitive) adds the value to the currently
 * selected set; it is ignored if no {@code User-agent} line came first.</li>
 * </ul>
 * All other keys are ignored.
 *
 * @param streamLimiter
 *            supplies the input stream holding the robots.txt content
 * @param lang
 *            not read by this implementation
 * @throws IOException
 *             if reading from or closing the stream fails
 */
@Override
public void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException {
	BufferedReader br = new BufferedReader(new InputStreamReader(streamLimiter.getNewInputStream()));
	try {
		DisallowSet currentDisallowSet = null;
		String line;
		while ((line = br.readLine()) != null) {
			line = line.trim();
			if (line.length() == 0 || line.startsWith("#")) {
				continue;
			}

			// Split on the first colon only: the value may contain colons.
			int separator = line.indexOf(':');
			if (separator < 0) {
				continue;
			}
			String key = line.substring(0, separator).trim();
			String value = line.substring(separator + 1).trim();
			// Lines with an empty value are skipped, as before.
			if (value.length() == 0) {
				continue;
			}

			if ("User-agent".equalsIgnoreCase(key)) {
				currentDisallowSet = getOrCreate(value.toLowerCase());
			} else if ("Disallow".equalsIgnoreCase(key)) {
				if (currentDisallowSet != null) {
					currentDisallowSet.add(value);
				}
			}
		}
	} finally {
		// Release the reader even when readLine() fails part-way through.
		br.close();
	}
}