Example #1
0
  /**
   * Parses a binary Excel document (opened as an {@code HSSFWorkbook}) and fills a new parser
   * result item with its metadata and text.
   *
   * <p>When the extractor exposes summary information, its title, author and subject are added
   * to the result. The extracted text is passed through
   * {@code StringUtils.replaceConsecutiveSpaces(text, " ")} before being stored in the content
   * field, and {@code langDetection(10000, content)} is then invoked on the result.
   *
   * @param streamLimiter supplies the input stream the workbook is read from
   * @param lang not read by this method
   * @throws IOException propagated from opening or reading the workbook
   */
  @Override
  protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException {
    HSSFWorkbook hssfWorkbook = new HSSFWorkbook(streamLimiter.getNewInputStream());
    // Declared outside the try so the finally clause can hand it to IOUtils.close,
    // including the case where the constructor below throws and it is still null.
    ExcelExtractor extractor = null;
    try {
      extractor = new ExcelExtractor(hssfWorkbook);
      ParserResultItem item = getNewParserResultItem();

      // Document metadata is optional.
      SummaryInformation summary = extractor.getSummaryInformation();
      if (summary != null) {
        item.addField(ParserFieldEnum.title, summary.getTitle());
        item.addField(ParserFieldEnum.author, summary.getAuthor());
        item.addField(ParserFieldEnum.subject, summary.getSubject());
      }

      String normalizedText = StringUtils.replaceConsecutiveSpaces(extractor.getText(), " ");
      item.addField(ParserFieldEnum.content, normalizedText);
      item.langDetection(10000, ParserFieldEnum.content);
    } finally {
      IOUtils.close(extractor);
    }
  }
 /**
  * Extracts the {@code Disallow} rules from a robots.txt document.
  *
  * <p>Each line is trimmed; empty lines and lines starting with {@code #} are skipped. The
  * remaining lines are split at the <em>first</em> colon into a key and a value, so values that
  * themselves contain colons (for example {@code /wiki/Special:Search}) are kept intact. Lines
  * without a colon or with an empty value are ignored.
  *
  * <p>A {@code User-agent} line selects, through {@code getOrCreate}, the disallow set for the
  * lower-cased agent name. Each following {@code Disallow} line adds its value to the currently
  * selected set. {@code Disallow} lines seen before any {@code User-agent} line are ignored, as
  * are all other keys. Key matching is case-insensitive.
  *
  * @param streamLimiter supplies the input stream of the robots.txt content
  * @param lang not read by this method
  * @throws IOException if reading from the stream fails
  */
 @Override
 public void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException {
   // try-with-resources closes the reader even when readLine() throws.
   // robots.txt is specified as UTF-8, so decode explicitly instead of relying on the
   // platform default charset.
   try (BufferedReader br =
       new BufferedReader(
           new InputStreamReader(
               streamLimiter.getNewInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
     DisallowSet currentDisallowSet = null;
     String line;
     while ((line = br.readLine()) != null) {
       line = line.trim();
       if (line.isEmpty() || line.startsWith("#")) {
         continue;
       }
       // Split on the first colon only: the value may legitimately contain colons.
       int separator = line.indexOf(':');
       if (separator < 0) {
         continue;
       }
       String key = line.substring(0, separator).trim();
       String value = line.substring(separator + 1).trim();
       if (value.isEmpty()) {
         // A directive without a value carries no rule to record.
         continue;
       }
       if ("User-agent".equalsIgnoreCase(key)) {
         currentDisallowSet = getOrCreate(value.toLowerCase());
       } else if ("Disallow".equalsIgnoreCase(key)) {
         if (currentDisallowSet != null) {
           currentDisallowSet.add(value);
         }
       }
     }
   }
 }