示例#1
0
  /**
   * Scans the body of {@code dequeuedHtml} for {@code <img src="...">} tags whose source ends in
   * one of the configured image extensions, and enqueues on {@code m_DownloadQueue} every such
   * link for which {@code m_AllTraversedLinks.Exists} returns false.
   *
   * <p>The regular expression only matches tags in which {@code src} directly follows the tag
   * name, and in which the tag name, attribute name and extension are written entirely in lower
   * case or entirely in upper case.
   *
   * @param dequeuedHtml page whose body is searched for image links
   */
  private void getImgLinks(HTML dequeuedHtml) {
    String[] allValidExtension = m_ConfigData.getImageExtensions();
    if (allValidExtension == null || allValidExtension.length == 0) {
      // Without extensions the alternation would be a bare "(" and the final regex would contain
      // an unclosed group, so Pattern.compile would throw. There is nothing to search for.
      System.out.println("Analyzer: no image extensions configured, skipping image links");
      return;
    }

    // Builds "(ext1|EXT1|ext2|EXT2|...)". Locale.ROOT keeps the case conversion independent of
    // the default locale (e.g. Turkish maps 'I' to a dotless 'i').
    StringBuilder extensionAlternation = new StringBuilder("(");
    for (int i = 0; i < allValidExtension.length; i++) {
      if (i > 0) {
        extensionAlternation.append('|');
      }
      extensionAlternation
          .append(allValidExtension[i].toLowerCase(java.util.Locale.ROOT))
          .append('|')
          .append(allValidExtension[i].toUpperCase(java.util.Locale.ROOT));
    }
    extensionAlternation.append(')');

    String regex = "<(img|IMG)\\s+(src|SRC)=\"(.*?\\." + extensionAlternation + ")\"";
    System.out.println("Regex for image is " + regex);
    Pattern pattern = Pattern.compile(regex);
    Matcher m = pattern.matcher(dequeuedHtml.GetBody());
    System.out.println("fetching  image links");
    while (m.find()) {
      // Group 3 is the whole src value, including the dot and the extension.
      String link = m.group(3);
      System.out.println("enqueue to Downloader: " + link);
      if (!m_AllTraversedLinks.Exists(link)) {
        m_DownloadQueue.enqueue(link);
        System.out.println("Analyzer: adding image link: " + link);
      }
    }
    System.out.println("Done with fetching image links");
  }
示例#2
0
 /**
  * Verifies that {@code map} and {@code queue} hold the same number of entries.
  *
  * <p>When {@code queue} is a {@code SynchronizedQueue}, its own {@code assertConsistency()} is
  * invoked first.
  *
  * @throws AssertionError if the map size and the queue size differ
  */
 public void assertConsistency() {
   if (queue instanceof SynchronizedQueue) {
     ((SynchronizedQueue) queue).assertConsistency();
   }

   final int entriesInMap = map.size();
   final int entriesInQueue = queue.size();
   if (entriesInQueue == entriesInMap) {
     return;
   }

   StringBuilder message = new StringBuilder();
   message.append("The map size is ").append(entriesInMap);
   message.append(" is different from the queue size ").append(entriesInQueue);
   throw new AssertionError(message.toString());
 }
示例#3
0
 /**
  * Scans the body of {@code dequeuedHtml} for anchor tags and enqueues on
  * {@code m_DownloadQueue} every captured link for which {@code m_AllTraversedLinks.Exists}
  * returns false.
  *
  * <p>A tag is matched when {@code href} directly follows the tag name and its value does not
  * start with {@code #}; the captured link is the text between {@code href="} and the next
  * {@code ">}.
  *
  * @param dequeuedHtml page whose body is searched for links
  */
 private void getUrlLinks(HTML dequeuedHtml) {
   // TODO: decide whether an HTTP prefix means the link is external.
   final String anchorRegex = "<(a|A)\\s+(href|HREF)=\"(?!#)(.*?)\">";
   Matcher anchors = Pattern.compile(anchorRegex).matcher(dequeuedHtml.GetBody());

   System.out.println("fetching links");
   while (anchors.find()) {
     // Group 3 is the href value; the (?!#) lookahead does not capture.
     String href = anchors.group(3);
     System.out.println("enqueue to Downloader: " + href);
     if (m_AllTraversedLinks.Exists(href)) {
       continue;
     }
     m_DownloadQueue.enqueue(href);
     System.out.println("Analyzer: adding link: " + href);
   }
   System.out.println("Done with fetching links");
 }
示例#4
0
  /**
   * Analyzer loop: while {@code ThreadManager.isRunning} is true, dequeues items from
   * {@code m_AnalyzeQueue} and dispatches each one according to the content type read from its
   * header.
   *
   * <p>{@code ThreadManager.updateStateAnalyzer(true)} is called before every dequeue and
   * {@code updateStateAnalyzer(false)} once a non-null item has been obtained. Image and text
   * content is passed to {@code HandleHTML}; video and document content is only logged. Any
   * exception raised while determining or handling the content type is logged and the loop
   * continues with the next item.
   */
  @Override
  public void run() {
    // TODO: think about whether the loop condition should also consider the queue sizes.
    while (ThreadManager.isRunning) {
      ThreadManager.updateStateAnalyzer(true);
      HTML page = m_AnalyzeQueue.dequeue();
      if (page == null) {
        continue;
      }

      ThreadManager.updateStateAnalyzer(false);
      System.out.println("Analyzer: dequeued: \n Header:\n" + page.GetHeader());
      try {
        String contentType = getContentTypeFromHeader(page.GetHeader());
        if (contentType.contains(IMAGE)) {
          System.out.println("Analzyer: it is an image");
          // TODO: check that the extension is supported.
          HandleHTML(page, HTML.TypeOfHTML.IMAGE);
        } else if (contentType.contains(VIDEO)) {
          // Video content is recognised but not processed yet.
          System.out.println("Analzyer: it is an video");
        } else if (contentType.contains(DOCUMENT)) {
          // Document content is recognised but not processed yet.
          // TODO: work out which tag references documents.
          System.out.println("Analzyer: it is an document");
        } else if (contentType.contains("text")) {
          HandleHTML(page, HTML.TypeOfHTML.TEXT);
        } else {
          System.out.println("Downloader: Error in content type of " + page.toString());
        }
      } catch (Exception failure) {
        System.out.println("Couldnt get content type from: " + page.GetHeader());
        System.out.println(failure.getMessage());
        failure.printStackTrace();
      }
    }
    System.out.println("Analyzer Finished!");
  }
  /**
   * Serves requests forever: takes a socket from {@code socketRequestsQueue}, reads the request,
   * selects a response, writes the header lines and (unless the request type is {@code HEAD}) the
   * entity body, then closes the stream and the socket.
   *
   * <p>Response selection, in order: 400 when the request is not legal, 501 when
   * {@code legalRequestType} rejects it, 403 when {@code directRequestToResultPages} accepts it.
   * Otherwise {@code TRACE} and {@code POST} get 200, and every other type gets 500 if the file
   * check throws an {@link IOException}, 404 if the file is not legal, and 200 if it is.
   *
   * @throws Exception any exception that is not caught inside the loop (for example from taking a
   *     socket, opening its output stream, reading the request or sending the body) propagates to
   *     the caller and ends the loop
   */
  private void processRequest() throws Exception {
    while (true) {

      Socket socket = socketRequestsQueue.take();

      DataOutputStream socketOutputStream = new DataOutputStream(socket.getOutputStream());

      HtmlRequest htmlRequest = readRequest(socket);

      // If the request is empty then the socket was closed on the other side.
      // This must be a reference check: equals(null) is never true for a non-null object and
      // throws a NullPointerException when the request itself is null.
      if (htmlRequest == null) {
        try {
          socket.close();
        } catch (Exception e) {
          System.out.println("Error on trying to close socket on empty request: " + e.toString());
        }
        continue;
      }

      HtmlResponse responseToClient;

      if (!htmlRequest.isLegalRequest) {
        // The request format is illegal
        responseToClient = respond400(htmlRequest);
      } else if (!legalRequestType(htmlRequest)) {
        // The request method is unimplemented
        responseToClient = respond501(htmlRequest);
      } else if (directRequestToResultPages(htmlRequest)) {
        responseToClient = respond403(htmlRequest);
      } else if (!htmlRequest.type.equals("TRACE") && !htmlRequest.type.equals("POST")) {
        boolean isFileLegal = false;
        boolean fileCheckFailed = false;
        try {
          isFileLegal = checkIfRequestedFileLegal(htmlRequest);
        } catch (IOException e) {
          fileCheckFailed = true;
        }
        // A failed check must yield 500; it must not fall through to the 404 branch just
        // because isFileLegal kept its initial value.
        if (fileCheckFailed) {
          responseToClient = respond500(htmlRequest);
        } else if (!isFileLegal) {
          responseToClient = respond404(htmlRequest);
        } else {
          responseToClient = respond200(htmlRequest);
        }
      } else {
        responseToClient = respond200(htmlRequest);
      }

      try {
        // Send the status line.
        socketOutputStream.writeBytes(responseToClient.getStatusLine());

        // Send the content type line.
        socketOutputStream.writeBytes(responseToClient.getContentType());

        // Send either the transfer encoding (chunked) or the content length.
        if (htmlRequest.isChunked) {
          socketOutputStream.writeBytes(responseToClient.getTransferEncoding());
        } else {
          socketOutputStream.writeBytes(responseToClient.getContentLengthLine());
        }

        // Send a blank line to indicate the end of the header lines.
        socketOutputStream.writeBytes(CRLF);

      } catch (Exception e) {
        System.out.println("Writing the header caused an error" + e.toString());
      }

      // Send the content of the HTTP.
      if (!htmlRequest.type.equals("HEAD")) {
        sendEntityBodyToClient(socketOutputStream, responseToClient, htmlRequest.isChunked);
      }

      // Close streams and socket.
      try {
        socketOutputStream.close();
        socket.close();
      } catch (Exception e) {
        System.out.println("closing the socket caused an error");
      }
    }
  }
 /** Checks that a {@code SynchronizedQueue} compares equal to itself. */
 @Test
 public void testEquals() {
   SynchronizedQueue<Object> queueUnderTest = new SynchronizedQueue<Object>();
   boolean equalToItself = queueUnderTest.equals(queueUnderTest);
   assertTrue(equalToItself);
 }