Пример #1
0
  /**
   * Scans the body of the given page for {@code <img src="...">} tags whose target ends in one of
   * the image extensions returned by {@code m_ConfigData.getImageExtensions()} and enqueues each
   * such link into {@code m_DownloadQueue} unless {@code m_AllTraversedLinks.Exists(link)} is
   * true.
   *
   * <p>Matching is case-insensitive for the tag name, the attribute name and the extension. Only
   * tags where {@code src} is the first attribute after {@code img} are recognised.
   *
   * @param dequeuedHtml page whose {@code GetBody()} content is scanned for image links
   */
  private void getImgLinks(HTML dequeuedHtml) {
    String[] allValidExtension = m_ConfigData.getImageExtensions();
    if (allValidExtension == null || allValidExtension.length == 0) {
      // With no extensions the alternation would be empty and the regex malformed,
      // so there is nothing that could match: skip the scan entirely.
      System.out.println("No image extensions configured, skipping image links");
      return;
    }

    // Build "ext1|ext2|..." with every extension quoted, so that regex metacharacters
    // coming from configuration are matched literally.
    StringBuilder extensionAlternation = new StringBuilder();
    for (String extension : allValidExtension) {
      if (extensionAlternation.length() > 0) {
        extensionAlternation.append('|');
      }
      extensionAlternation.append(Pattern.quote(extension));
    }

    // [^"]* keeps the captured link inside the src attribute value; a plain ".*?" could run
    // past the closing quote into later attributes or tags until it found an extension.
    String regex = "<img\\s+src=\"([^\"]*\\.(?:" + extensionAlternation + "))\"";
    System.out.println("Regex for image is " + regex);
    // CASE_INSENSITIVE replaces the hand-written lower/upper variants and also covers
    // mixed-case spellings, independent of the default locale.
    Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
    Matcher m = pattern.matcher(dequeuedHtml.GetBody());
    System.out.println("fetching  image links");
    while (m.find()) {
      String link = m.group(1);
      System.out.println("enqueue to Downloader: " + link);
      if (!m_AllTraversedLinks.Exists(link)) {
        // NOTE(review): the link is not recorded in m_AllTraversedLinks here; presumably
        // that happens elsewhere (e.g. once it is downloaded) - confirm.
        m_DownloadQueue.enqueue(link);
        System.out.println("Analyzer: adding image link: " + link);
      }
    }
    System.out.println("Done with fetching image links");
  }
Пример #2
0
 /**
  * Scans the body of the given page for {@code <a href="...">} anchors and enqueues each link
  * into {@code m_DownloadQueue} unless {@code m_AllTraversedLinks.Exists(link)} is true.
  *
  * <p>Anchors whose href starts with {@code #} are skipped. Matching of the tag and attribute
  * names is case-insensitive. Only anchors where {@code href} is the first attribute after
  * {@code a} are recognised.
  *
  * @param dequeuedHtml page whose {@code GetBody()} content is scanned for links
  */
 private void getUrlLinks(HTML dequeuedHtml) {
   // TODO: decide whether an "http" prefix means the link is external.
   // [^"]* stops the capture at the closing quote of the href value; the previous
   // "(.*?)\">" pattern swallowed any attributes that followed href into the link.
   String regex = "<a\\s+href=\"(?!#)([^\"]*)\"";
   Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
   Matcher m = pattern.matcher(dequeuedHtml.GetBody());
   System.out.println("fetching links");
   while (m.find()) {
     String link = m.group(1);
     System.out.println("enqueue to Downloader: " + link);
     if (!m_AllTraversedLinks.Exists(link)) {
       // NOTE(review): the link is not recorded in m_AllTraversedLinks here; presumably
       // that happens elsewhere (e.g. once it is downloaded) - confirm.
       m_DownloadQueue.enqueue(link);
       System.out.println("Analyzer: adding link: " + link);
     }
   }
   System.out.println("Done with fetching links");
 }