Пример #1
0
 /**
  * Records one image in the image aggregator of {@code m_ResultWrapper}.
  *
  * <p>The size is taken from {@code extractContentLength(dequeuedHtml)}. When that call yields
  * {@code -1} nothing is aggregated and the response header is printed instead.
  *
  * <p>NOTE(review): this method locks on {@code m_ResultWrapper.m_ImageAggregator}, while
  * {@code HandleHTML} updates the same counters under a lock on {@code m_ResultWrapper}. If both
  * paths can run concurrently they do not exclude each other - confirm which monitor is intended.
  *
  * @param dequeuedHtml the downloaded item whose header describes an image
  */
 private void handleCaseOfImage(HTML dequeuedHtml) {
   final int sizeInBytes = extractContentLength(dequeuedHtml);
   if (sizeInBytes == -1) {
     // No usable size: report the header and leave the counters untouched.
     System.out.println(dequeuedHtml.GetHeader() + " will not be aggregated!!!");
     return;
   }

   // Both counters are updated under one lock so they stay consistent with each other.
   synchronized (m_ResultWrapper.m_ImageAggregator) {
     m_ResultWrapper.m_ImageAggregator.m_NumOfImages++;
     m_ResultWrapper.m_ImageAggregator.m_TotalSizeInBytes += sizeInBytes;
   }
 }
Пример #2
0
  /**
   * Aggregates statistics for a dequeued item according to its kind and, for text pages, goes on
   * to collect the links it contains.
   *
   * <ul>
   *   <li>{@code TEXT}: if a content length was found, bumps the page count and total page size
   *       under a lock on {@code m_ResultWrapper}; then always prints the status line and calls
   *       {@code getImgLinks} and {@code getUrlLinks}.
   *   <li>{@code IMAGE}: if a content length was found, bumps the image count and total image
   *       size under the same lock; otherwise prints the header.
   *   <li>Any other value: does nothing.
   * </ul>
   *
   * @param dequeuedHtml the downloaded item to account for
   * @param typeOfHtml which kind of content the item was classified as
   */
  private void HandleHTML(HTML dequeuedHtml, HTML.TypeOfHTML typeOfHtml) {
    final int sizeInBytes = extractContentLength(dequeuedHtml);
    // extractContentLength signals "no usable Content-Length" with -1.
    final boolean sizeKnown = sizeInBytes != -1;

    switch (typeOfHtml) {
      case IMAGE:
        if (!sizeKnown) {
          // TODO: Delete this, debugging purposes
          System.out.println(dequeuedHtml.GetHeader() + " will not be aggregated!!!");
          break;
        }
        synchronized (m_ResultWrapper) {
          m_ResultWrapper.m_ImageAggregator.m_NumOfImages++;
          m_ResultWrapper.m_ImageAggregator.m_TotalSizeInBytes += sizeInBytes;
        }
        break;

      case TEXT:
        if (sizeKnown) {
          synchronized (m_ResultWrapper) {
            m_ResultWrapper.m_PageAggregator.m_NumOfPages++;
            m_ResultWrapper.m_PageAggregator.m_TotalSizeOfPagesInBytes += sizeInBytes;
          }
        }
        // Printed whether or not the size was aggregated above.
        System.out.println("(Data of page was aggregated) content is text");
        getImgLinks(dequeuedHtml);
        // TODO: can links have HTTP:// ?
        getUrlLinks(dequeuedHtml);
        break;

      default:
        break;
    }
  }
Пример #3
0
  /**
   * Locates a {@code Content-Length} field inside a raw header string.
   *
   * <p>HTTP field names are case-insensitive and the whitespace after the colon is optional, so
   * the match ignores case and accepts zero or more whitespace characters. The digit group may be
   * empty on purpose: an empty value then fails integer parsing and is reported as malformed
   * rather than being silently skipped. Compiled once because the expression never changes.
   */
  private static final Pattern CONTENT_LENGTH_PATTERN =
      Pattern.compile("Content-Length:\\s*(\\d*)", Pattern.CASE_INSENSITIVE);

  /**
   * Reads the {@code Content-Length} value from the header of the given item.
   *
   * <p>Prints the parsed length on success, or a diagnostic containing the full header when the
   * field is present but its value cannot be parsed as an {@code int} (empty value, or a number
   * larger than {@link Integer#MAX_VALUE}).
   *
   * @param dequeuedHtml the downloaded item whose header is inspected
   * @return the content length in bytes, or {@code -1} when the header is {@code null}, has no
   *     {@code Content-Length} field, or the field value cannot be parsed
   */
  private int extractContentLength(HTML dequeuedHtml) {
    final String header = dequeuedHtml.GetHeader();
    if (header == null) {
      // Nothing to search; treat exactly like a missing field.
      return -1;
    }

    final Matcher m = CONTENT_LENGTH_PATTERN.matcher(header);
    if (!m.find()) {
      return -1;
    }

    try {
      final int contentLength = Integer.parseInt(m.group(1));
      System.out.println("content length is: " + contentLength);
      return contentLength;
    } catch (NumberFormatException e) {
      // Field present but unusable: report it and fall back to the "unknown" sentinel.
      System.out.println("Could not parse Content-Lenght, header is malformed\n" + header);
      return -1;
    }
  }
Пример #4
0
  /**
   * Analyzer loop: while {@code ThreadManager.isRunning} is set, takes items from
   * {@code m_AnalyzeQueue} and dispatches each one according to the content type found in its
   * header.
   *
   * <p>Images and text are forwarded to {@code HandleHTML}; video and document types are only
   * logged; any other content type is logged as an error. Any exception raised while classifying
   * or handling an item is caught, printed, and the loop moves on to the next item.
   *
   * <p>NOTE(review): if {@code dequeue()} can return {@code null} without blocking, this loop
   * spins - confirm the queue's contract.
   */
  @Override
  public void run() {
    // TODO: think about this condition (queue sizes were considered as an alternative).
    while (ThreadManager.isRunning) {
      // State is flagged true before dequeuing and false once an item is in hand.
      ThreadManager.updateStateAnalyzer(true);
      final HTML page = m_AnalyzeQueue.dequeue();
      if (page == null) {
        continue;
      }
      ThreadManager.updateStateAnalyzer(false);
      System.out.println("Analyzer: dequeued: \n Header:\n" + page.GetHeader());

      try {
        final String contentType = getContentTypeFromHeader(page.GetHeader());

        if (contentType.contains(IMAGE)) {
          System.out.println("Analzyer: it is an image");
          // TODO: check extension supported
          HandleHTML(page, HTML.TypeOfHTML.IMAGE);
        } else if (contentType.contains(VIDEO)) {
          // Video is recognised but not processed further yet.
          System.out.println("Analzyer: it is an video");
        } else if (contentType.contains(DOCUMENT)) {
          // TODO: what is the command for document? <a href? <doc src=?
          System.out.println("Analzyer: it is an document");
        } else if (contentType.contains("text")) {
          HandleHTML(page, HTML.TypeOfHTML.TEXT);
        } else {
          System.out.println("Downloader: Error in content type of " + page.toString());
        }
      } catch (Exception e) {
        // Keep the analyzer alive: report the failure for this item and continue.
        System.out.println("Couldnt get content type from: " + page.GetHeader());
        System.out.println(e.getMessage());
        e.printStackTrace();
      }
    }
    System.out.println("Analyzer Finished!");
  }