Exemplo n.º 1
0
  /**
   * Reads the PDF configured under {@code ApplicationConstants.PDF_INPUT} and passes the text
   * lines of every page to {@code genarateContacts}.
   *
   * <p>On the first page everything up to the first line separator is skipped (presumably a
   * header line; confirm against the input format). All following pages are passed on in full.
   * Failures are logged and printed to the console rather than propagated, and the method still
   * returns {@code list} in whatever state it is in at that point.
   *
   * @param contactsInitializer supplies the properties that hold the PDF path
   * @return the {@code list} field of the enclosing class (presumably filled by
   *     {@code genarateContacts}; that method is not visible here)
   */
  public List<Contacts> readContacts(ContactsInitializer contactsInitializer) {
    PdfReader reader = null;
    try {
      Properties properties = contactsInitializer.getProperties();
      String pdfinput = properties.getProperty(ApplicationConstants.PDF_INPUT);
      reader = new PdfReader(pdfinput);

      // Split page one into "first line" and "everything else".
      String firstPageText = PdfTextExtractor.getTextFromPage(reader, ApplicationConstants.ONE);
      String[] headerAndBody =
          firstPageText.split(ApplicationConstants.NEW_LINE, ApplicationConstants.TWO);
      // A first page without any line separator yields a single-element array. Indexing [1]
      // unconditionally used to throw and abort the whole import, so the remaining pages were
      // never read. With nothing after the first line there is simply nothing to parse here.
      if (headerAndBody.length > 1) {
        genarateContacts(headerAndBody[1].split(ApplicationConstants.NEW_LINE));
      }

      int no_of_pages = reader.getNumberOfPages();
      for (int page_no = ApplicationConstants.TWO; page_no <= no_of_pages; page_no++) {
        String pageText = PdfTextExtractor.getTextFromPage(reader, page_no);
        genarateContacts(pageText.split(ApplicationConstants.NEW_LINE));
      }
      logger.info(list.size() + " contacts added to list from pdf file successfully !");
      System.out.println(list.size() + " contacts added to list from pdf file successfully !");

    } catch (Exception exception) {
      // Deliberate best-effort: report the failure and fall through to return the list.
      System.out.println("contacts not created !");
      logger.error("contacts not created !");
      logger.error(exception);
      exception.printStackTrace();
    } finally {
      if (reader != null) {
        try {
          reader.close();
          logger.debug(LoggerConstants.RESOURCES_RELEASED);
        } catch (Exception exception) {
          logger.error(exception);
          exception.printStackTrace();
        }
      }
    }
    return list;
  }
Exemplo n.º 2
0
 /**
  * Parses a specific area of a PDF to a plain text file.
  *
  * <p>For every page, only the text that passes a {@code RegionTextRenderFilter} built from
  * {@code new Rectangle(70, 80, 490, 580)} is extracted, and the result is written with one
  * {@code println} per page. Both the reader and the writer are released even when extraction
  * fails part-way through.
  *
  * @param pdf the original PDF
  * @param txt the resulting text
  * @throws IOException if the PDF cannot be opened, the output file cannot be created, or
  *     writing the extracted text fails
  */
 public void parsePdf(String pdf, String txt) throws IOException {
   PdfReader reader = new PdfReader(pdf);
   try {
     // NOTE(review): PrintWriter over a bare FileOutputStream uses the platform default charset.
     PrintWriter out = new PrintWriter(new FileOutputStream(txt));
     try {
       Rectangle rect = new Rectangle(70, 80, 490, 580);
       RenderFilter filter = new RegionTextRenderFilter(rect);
       for (int i = 1; i <= reader.getNumberOfPages(); i++) {
         // A new strategy per page, as in the original; presumably because the strategy
         // accumulates the text it has seen (confirm against the iText documentation).
         TextExtractionStrategy strategy =
             new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);
         out.println(PdfTextExtractor.getTextFromPage(reader, i, strategy));
       }
       // PrintWriter never throws on write failures; checkError() flushes and reports them.
       if (out.checkError()) {
         throw new IOException("Failed to write extracted text to " + txt);
       }
     } finally {
       out.close();
     }
   } finally {
     reader.close();
   }
 }
Exemplo n.º 3
0
  /**
   * Extracts the text of every page of the given PDF and hands each matching line to
   * {@code parseLine}.
   *
   * <p>A line matches when it starts with two digits, a dot, two digits, a dot and four digits
   * (for example {@code 01.02.2020}) followed by at least one more character. The input stream
   * and the reader are closed before the method returns, whether it succeeds or fails.
   *
   * @param file the PDF file to read
   * @throws IOException if the file cannot be opened or read
   */
  private void readFile(File file) throws IOException {
    try (FileInputStream inputStream = new FileInputStream(file)) {
      PdfReader reader = new PdfReader(inputStream);
      try {
        int numberOfPages = reader.getNumberOfPages();

        for (int page = 1; page <= numberOfPages; page++) {
          String pageContent = PdfTextExtractor.getTextFromPage(reader, page);

          // Iterate the split result directly; no intermediate list copy is needed.
          for (String line : pageContent.split("\n")) {
            if (line.matches("^\\d{2}\\.\\d{2}\\.\\d{4}.+")) {
              parseLine(line);
            }
          }
        }
      } finally {
        reader.close();
      }
    }
  }
  /**
   * Runs speaker and transcript extraction for every entry of the topic page-range map.
   *
   * <p>For each entry the pages {@code key + 1} through {@code value + 1} are read from
   * {@code HansardParser.my_reader}. After each page the accumulated speakers and transcript logs
   * are handed to {@code Utils.writeMergedSpeakers} and
   * {@code Utils.writeMergedSpeechTranscripts}. Unless the last page is page 1 or is not below
   * the reader's page count, one additional page is processed into separate "unsure" outputs.
   * Progress and the final error count are printed through {@code out}.
   *
   * @param myHalamanStartEnd page ranges per topic; key and value are each incremented by one
   *     to obtain the first and last page read
   * @param myHalamanHash passed to {@code Utils.getTopicbyPageNumber} to obtain the topic title
   * @throws IOException if either map is {@code null}; helpers called here may presumably raise
   *     it as well (they are not visible in this block)
   */
  public static void identifySpeakersinTopic(
      Map<Integer, Integer> myHalamanStartEnd, Map<Integer, List<String>> myHalamanHash)
      throws IOException {
    if (myHalamanStartEnd == null || myHalamanHash == null) {
      throw new IOException("Missing Pre-req Maps!!");
    }
    // Iterate through each topic start/end page combination.
    for (Map.Entry<Integer, Integer> topicRange : myHalamanStartEnd.entrySet()) {
      Integer topicPage = topicRange.getKey();
      // Fresh accumulators for every topic so nothing carries over from the previous one.
      Map<String, String> confirmedSpeakers = new TreeMap<>();
      Map<String, String> unsureSpeakers = new TreeMap<>();
      List<Map<String, String>> confirmedLogs = new ArrayList<>();
      List<Map<String, String>> unsureLogs = new ArrayList<>();
      // The cleaned-up topic title is the main key for everything written below.
      String topicTitle = Utils.getTopicbyPageNumber(topicPage, myHalamanHash);
      // TODO: initialize the 4 sub-keyspaces: Speakers, Log, Speakers_Maybe, Log_Maybe
      int firstPage = topicPage + 1;
      int lastPage = topicRange.getValue() + 1;
      out.println(
          "For current block with title: "
              + topicTitle
              + " start page is "
              + firstPage
              + " and end page is "
              + lastPage);
      for (int pageNumber = firstPage; pageNumber <= lastPage; pageNumber++) {
        out.println("Page " + pageNumber);
        out.println("===========");
        // The result folders must be available before anything is written for this page.
        boolean foldersReady =
            Utils.createParentFoldersIfMissing(
                String.format(
                    ITextBlast.working_dir + HansardParser.RESULT_FOLDER,
                    HansardParser.hansard_filename,
                    topicTitle));
        if (!foldersReady) {
          continue;
        }
        // Clean the raw page text before the speaker-identification stage.
        String pageText = PdfTextExtractor.getTextFromPage(HansardParser.my_reader, pageNumber);
        String content = Utils.prepareContentForSpeakerIdentification(pageText);
        // Identify people and write the merged result so far.
        confirmedSpeakers.putAll(observeSpeakers(content));
        String speakersPath =
            String.format(
                ITextBlast.working_dir + RESULT_SPEAKERS,
                HansardParser.hansard_filename,
                topicTitle);
        Utils.writeMergedSpeakers(confirmedSpeakers, speakersPath);
        // Identify speech blocks in order and append them to this topic's growing log.
        confirmedLogs.addAll(preparePage(content));
        String transcriptPath =
            String.format(
                ITextBlast.working_dir + RESULT_TRANSCRIPT,
                HansardParser.hansard_filename,
                topicTitle);
        Utils.writeMergedSpeechTranscripts(confirmedLogs, transcriptPath);
      }
      // Grab one extra page when there is one; its findings go to the "unsure" outputs.
      if (lastPage != 1 && lastPage < HansardParser.my_reader.getNumberOfPages()) {
        int extraPage = lastPage + 1;
        out.println("Page " + extraPage);
        out.println("===========");
        String extraText = PdfTextExtractor.getTextFromPage(HansardParser.my_reader, extraPage);
        String content = Utils.prepareContentForSpeakerIdentification(extraText);
        // Identify people on the extra page.
        unsureSpeakers.putAll(observeSpeakers(content));
        String unsureSpeakersPath =
            String.format(
                ITextBlast.working_dir + RESULT_SPEAKERS_UNSURE,
                HansardParser.hansard_filename,
                topicTitle);
        Utils.writeMergedSpeakers(unsureSpeakers, unsureSpeakersPath);
        // Identify speech blocks on the extra page.
        unsureLogs.addAll(preparePage(content));
        String unsureTranscriptPath =
            String.format(
                ITextBlast.working_dir + RESULT_TRANSCRIPT_UNSURE,
                HansardParser.hansard_filename,
                topicTitle);
        Utils.writeMergedSpeechTranscripts(unsureLogs, unsureTranscriptPath);
      }
    }

    out.println("Final ERR Count: " + HansardParser.my_error_count);
  }