Exemplo n.º 1
0
  /**
   * Предполагается, что на этапе запуска приожения, если с MainUrl что-то не так то контейнер не
   * запустится :-)
   */
  @PostConstruct
  public void init() {
    try {
      mainURI = new URI(properties.getProperty("MainUrl"), true, "UTF-8");
    } catch (Exception e) {
      throw new RuntimeException(ERR_MSG + e.getMessage());
    }
    if (!mainURI.isAbsoluteURI()) {
      throw new RuntimeException(ERR_MSG + "URI not absolute path");
    }

    try {
      String mainHost = mainURI.getHost();
      if (mainHost == null) {
        throw new RuntimeException(ERR_MSG + "bad URI host");
      }
    } catch (URIException e) {
      throw new RuntimeException(ERR_MSG + e.getMessage());
    }

    try {
      secureURI =
          new URI(
              properties.getProperty("SecureUrl", mainURI.toString().replaceFirst("http", "https")),
              true,
              "UTF-8");
    } catch (Exception e) {
      throw new RuntimeException(ERR_MSG + e.getMessage());
    }
  }
Exemplo n.º 2
0
  private boolean isValidMessageToScan(HttpMessage msg) {
    if (getScannerOptions().isScanHeadersAllRequests()) {
      return true;
    }

    // First we check if it's a dynamic or static page
    // I'd to do this because scanning starts to be veeeeery slow
    // --
    // this is a trivial implementation, should be good to have
    // a page dynamic check at the parent plugin level which should
    // use or not Variants according to the behavior of the request
    // (e.g. different content or status error/redirect)
    String query = null;
    try {
      query = msg.getRequestHeader().getURI().getQuery();

    } catch (URIException e) {
      log.error(e.getMessage(), e);
    }

    // If there's almost one GET parameter go ahead
    if (query == null || query.isEmpty()) {
      // If also the Request body is null maybe it's a static page oer a null parameter page
      if (msg.getRequestBody().length() == 0) {
        return false;
      }
    }
    return true;
  }
Exemplo n.º 3
0
  /**
   * parameter와 queryString 를 가져온다.
   *
   * @param method
   * @return
   */
  private String getHttpInfoDumy(HttpMethod method) {
    NameValuePair[] params = null;
    String methodType = "GET";
    String reqBody = null;
    if (method instanceof PostMethod) {
      params = ((PostMethod) method).getParameters();
      methodType = "POST";
      StringRequestEntity sre = (StringRequestEntity) ((PostMethod) method).getRequestEntity();
      reqBody = sre.getContent();
    }

    StringBuffer sb = new StringBuffer();

    try {
      sb.append("#### getHttpInfoDumy ####");
      sb.append("\n## " + methodType + " [" + method.getURI() + "], hscd[" + this.hashCode() + "]");
    } catch (URIException e) {
      sb.append("\n## getParamsQueryStr- URIException " + e.getMessage() + "]");
      return sb.toString();
    }

    if (method.getQueryString() != null && method.getQueryString().length() > 0)
      sb.append("\n" + "## queryString[" + method.getQueryString() + "]");

    if (params != null) {
      for (int i = 0; i < params.length; i++) {
        NameValuePair param = params[i];
        sb.append(
            "\n"
                + "## POST body param["
                + i
                + "], name["
                + param.getName()
                + "], value["
                + param.getValue()
                + "]");
      }
    }

    if (reqBody != null) {
      sb.append("\n" + "## POST body String [" + reqBody + "]");
    }

    sb.append("\n##########");

    return sb.toString();
  }
  public boolean populate(CrawlURI curi, HttpClient http, HttpMethod method, String payload) {
    // http is not used.
    // payload is not used.
    boolean result = false;
    Map formItems = null;
    try {
      formItems = getFormItems(curi);
    } catch (AttributeNotFoundException e1) {
      logger.severe("Failed get of form items for " + curi);
    }
    if (formItems == null || formItems.size() <= 0) {
      try {
        logger.severe("No form items for " + method.getURI());
      } catch (URIException e) {
        logger.severe("No form items and exception getting uri: " + e.getMessage());
      }
      return result;
    }

    NameValuePair[] data = new NameValuePair[formItems.size()];
    int index = 0;
    String key = null;
    for (Iterator i = formItems.keySet().iterator(); i.hasNext(); ) {
      key = (String) i.next();
      data[index++] = new NameValuePair(key, (String) formItems.get(key));
    }
    if (method instanceof PostMethod) {
      ((PostMethod) method).setRequestBody(data);
      result = true;
    } else if (method instanceof GetMethod) {
      // Append these values to the query string.
      // Get current query string, then add data, then get it again
      // only this time its our data only... then append.
      HttpMethodBase hmb = (HttpMethodBase) method;
      String currentQuery = hmb.getQueryString();
      hmb.setQueryString(data);
      String newQuery = hmb.getQueryString();
      hmb.setQueryString(((currentQuery != null) ? currentQuery : "") + "&" + newQuery);
      result = true;
    } else {
      logger.severe("Unknown method type: " + method);
    }
    return result;
  }
 public boolean isPrerequisite(final CrawlURI curi) {
   boolean result = false;
   String curiStr = curi.getUURI().toString();
   String loginUri = getPrerequisite(curi);
   if (loginUri != null) {
     try {
       UURI uuri = UURIFactory.getInstance(curi.getUURI(), loginUri);
       if (uuri != null && curiStr != null && uuri.toString().equals(curiStr)) {
         result = true;
         if (!curi.isPrerequisite()) {
           curi.setPrerequisite(true);
           logger.fine(curi + " is prereq.");
         }
       }
     } catch (URIException e) {
       logger.severe("Failed to uuri: " + curi + ", " + e.getMessage());
     }
   }
   return result;
 }
Exemplo n.º 6
0
  protected void addHeaderLink(CrawlURI curi, Header loc) {
    if (loc == null) {
      // If null, return without adding anything.
      return;
    }
    // TODO: consider possibility of multiple headers
    try {
      /**
       * 302重定向使用自定义的方法存储link
       *
       * @modify: wuliufu
       * @since : 2012-05-11
       */
      curi.createAndAddLocationLink(
          curi.getVia(), loc.getValue(), loc.getName() + ":", Link.REFER_HOP);

      if (curi.getObject(URLInfo.ATTACH) != null) {
        UURI outUURI = UURIFactory.getInstance(curi.getUURI(), loc.getValue());
        logger.debug(
            "ParseHTTP: curi = "
                + curi.getUURI().toString()
                + "&& "
                + loc.getName()
                + "="
                + outUURI.toString());
        curi.putObject(outUURI.toString(), curi.getObject(URLInfo.ATTACH));
      }

      numberOfLinksExtracted++;
    } catch (URIException e) {
      // There may not be a controller (e.g. If we're being run
      // by the extractor tool).
      if (getController() != null) {
        getController().logUriError(e, curi.getUURI(), loc.getValue());
      } else {
        logger.info(curi + ", " + loc.getValue() + ": " + e.getMessage());
      }
    }
  }
Exemplo n.º 7
0
  /** Run method of the thread */
  public void run() {

    queue = manager.workQueue;
    while (manager.hasWorkLeft()) {

      working = false;
      // code to make the worker pause, if the pause button has been presed

      // if the stop signal has been given stop the thread
      if (stop) {
        return;
      }

      // this pasuses the thread
      synchronized (this) {
        while (pleaseWait) {
          try {
            wait();
          } catch (InterruptedException e) {
            return;
          } catch (Exception e) {
            e.printStackTrace();
          }
        }
      }

      GetMethod httpget = null;
      HeadMethod httphead = null;

      try {

        work = (WorkUnit) queue.take();
        working = true;
        url = work.getWork();
        int code = 0;

        String responce = "";
        String rawResponce = "";

        // if the work is a head request
        if (work.getMethod().equalsIgnoreCase("HEAD")) {
          if (Config.debug) {
            System.out.println("DEBUG Worker[" + threadId + "]: HEAD " + url.toString());
          }

          httphead = new HeadMethod(url.toString());

          // set the custom HTTP headers
          Vector HTTPheaders = manager.getHTTPHeaders();
          for (int a = 0; a < HTTPheaders.size(); a++) {
            HTTPHeader httpHeader = (HTTPHeader) HTTPheaders.elementAt(a);
            /*
             * Host header has to be set in a different way!
             */
            if (httpHeader.getHeader().startsWith("Host")) {
              httphead.getParams().setVirtualHost(httpHeader.getValue());
            } else {
              httphead.setRequestHeader(httpHeader.getHeader(), httpHeader.getValue());
            }
          }
          httphead.setFollowRedirects(Config.followRedirects);

          /*
           * this code is used to limit the number of request/sec
           */
          if (manager.isLimitRequests()) {
            while (manager.getTotalDone()
                    / ((System.currentTimeMillis() - manager.getTimestarted()) / 1000.0)
                > manager.getLimitRequestsTo()) {
              Thread.sleep(100);
            }
          }
          /*
           * Send the head request
           */
          code = httpclient.executeMethod(httphead);
          if (Config.debug) {
            System.out.println("DEBUG Worker[" + threadId + "]: " + code + " " + url.toString());
          }
          httphead.releaseConnection();

        }
        // if we are doing a get request
        else if (work.getMethod().equalsIgnoreCase("GET")) {
          // make the request;
          if (Config.debug) {
            System.out.println("DEBUG Worker[" + threadId + "]: GET " + url.toString());
          }
          httpget = new GetMethod(url.toString());

          // set the custom HTTP headers
          Vector HTTPheaders = manager.getHTTPHeaders();
          for (int a = 0; a < HTTPheaders.size(); a++) {

            HTTPHeader httpHeader = (HTTPHeader) HTTPheaders.elementAt(a);
            /*
             * Host header has to be set in a different way!
             */
            if (httpHeader.getHeader().startsWith("Host")) {
              httpget.getParams().setVirtualHost(httpHeader.getValue());
            } else {
              httpget.setRequestHeader(httpHeader.getHeader(), httpHeader.getValue());
            }
          }
          httpget.setFollowRedirects(Config.followRedirects);

          /*
           * this code is used to limit the number of request/sec
           */
          if (manager.isLimitRequests()) {
            while (manager.getTotalDone()
                    / ((System.currentTimeMillis() - manager.getTimestarted()) / 1000.0)
                > manager.getLimitRequestsTo()) {
              Thread.sleep(100);
            }
          }

          code = httpclient.executeMethod(httpget);

          if (Config.debug) {
            System.out.println("DEBUG Worker[" + threadId + "]: " + code + " " + url.toString());
          }

          // set up the input stream
          BufferedReader input =
              new BufferedReader(new InputStreamReader(httpget.getResponseBodyAsStream()));

          // save the headers into a string, used in viewing raw responce
          String rawHeader;
          rawHeader = httpget.getStatusLine() + "\r\n";
          Header headers[] = httpget.getResponseHeaders();

          StringBuffer buf = new StringBuffer();
          for (int a = 0; a < headers.length; a++) {
            buf.append(headers[a].getName() + ": " + headers[a].getValue() + "\r\n");
          }

          rawHeader = rawHeader + buf.toString();

          buf = new StringBuffer();
          // read in the responce body
          String line;
          while ((line = input.readLine()) != null) {
            buf.append("\r\n" + line);
          }
          responce = buf.toString();
          input.close();

          rawResponce = rawHeader + responce;
          // clean the responce

          // parse the html of what we have found

          if (Config.parseHTML && !work.getBaseCaseObj().isUseRegexInstead()) {
            Header contentType = httpget.getResponseHeader("Content-Type");

            if (contentType != null) {
              if (contentType.getValue().startsWith("text")) {
                manager.addHTMLToParseQueue(new HTMLparseWorkUnit(responce, work));
              }
            }
          }

          responce = FilterResponce.CleanResponce(responce, work);

          Thread.sleep(10);
          httpget.releaseConnection();
        } else {
          // There is no need to deal with requests other than HEAD or GET
        }

        // if we need to check the against the base case
        if (work.getMethod().equalsIgnoreCase("GET")
            && work.getBaseCaseObj().useContentAnalysisMode()) {
          if (code == 200) {
            if (Config.debug) {
              System.out.println(
                  "DEBUG Worker[" + threadId + "]: Base Case Check " + url.toString());
            }

            // TODO move this option to the Adv options
            // if the responce does not match the base case
            Pattern regexFindFile = Pattern.compile(".*file not found.*", Pattern.CASE_INSENSITIVE);

            Matcher m = regexFindFile.matcher(responce);

            // need to clean the base case of the item we are looking for
            String basecase =
                FilterResponce.removeItemCheckedFor(
                    work.getBaseCaseObj().getBaseCase(), work.getItemToCheck());

            if (m.find()) {
              // do nothing as we have a 404
            } else if (!responce.equalsIgnoreCase(basecase)) {
              if (work.isDir()) {
                if (Config.debug) {
                  System.out.println(
                      "DEBUG Worker[" + threadId + "]: Found Dir (base case)" + url.toString());
                }
                // we found a dir
                manager.foundDir(url, code, responce, basecase, rawResponce, work.getBaseCaseObj());
              } else {
                // found a file
                if (Config.debug) {
                  System.out.println(
                      "DEBUG Worker[" + threadId + "]: Found File (base case)" + url.toString());
                }
                manager.foundFile(
                    url,
                    code,
                    responce,
                    work.getBaseCaseObj().getBaseCase(),
                    rawResponce,
                    work.getBaseCaseObj());
              }
            }
          } else if (code == 404 || code == 400) {
            // again do nothing as it is not there
          } else {
            if (work.isDir()) {
              if (Config.debug) {
                System.out.println(
                    "DEBUG Worker[" + threadId + "]: Found Dir (base case)" + url.toString());
              }
              // we found a dir
              manager.foundDir(
                  url,
                  code,
                  responce,
                  work.getBaseCaseObj().getBaseCase(),
                  rawResponce,
                  work.getBaseCaseObj());
            } else {
              // found a file
              if (Config.debug) {
                System.out.println(
                    "DEBUG Worker[" + threadId + "]: Found File (base case)" + url.toString());
              }
              manager.foundFile(
                  url,
                  code,
                  responce,
                  work.getBaseCaseObj().getBaseCase(),
                  rawResponce,
                  work.getBaseCaseObj());
            }
            // manager.foundError(url, "Base Case Mode Error - Responce code came back as " + code +
            // " it should have been 200");
            // manager.workDone();
          }
        }
        /*
         * use the custom regex check instead
         */
        else if (work.getBaseCaseObj().isUseRegexInstead()) {
          Pattern regexFindFile = Pattern.compile(work.getBaseCaseObj().getRegex());

          Matcher m = regexFindFile.matcher(rawResponce);
          /*
          System.out.println("======Trying to find======");
          System.out.println(work.getBaseCaseObj().getRegex());
          System.out.println("======In======");
          System.out.println(responce);
          System.out.println("======/In======");
           */
          if (m.find()) {
            // do nothing as we have a 404
            if (Config.debug) {

              System.out.println(
                  "DEBUG Worker[" + threadId + "]: Regex matched so it's a 404, " + url.toString());
            }

          } else {
            if (Config.parseHTML) {
              Header contentType = httpget.getResponseHeader("Content-Type");

              if (contentType != null) {
                if (contentType.getValue().startsWith("text")) {
                  manager.addHTMLToParseQueue(new HTMLparseWorkUnit(rawResponce, work));
                }
              }
            }
            if (work.isDir()) {
              if (Config.debug) {
                System.out.println(
                    "DEBUG Worker[" + threadId + "]: Found Dir (regex) " + url.toString());
              }
              // we found a dir
              manager.foundDir(
                  url,
                  code,
                  responce,
                  work.getBaseCaseObj().getBaseCase(),
                  rawResponce,
                  work.getBaseCaseObj());
            } else {
              // found a file
              if (Config.debug) {
                System.out.println(
                    "DEBUG Worker[" + threadId + "]: Found File (regex) " + url.toString());
              }
              manager.foundFile(
                  url,
                  code,
                  responce,
                  work.getBaseCaseObj().getBaseCase(),
                  rawResponce,
                  work.getBaseCaseObj());
            }
            // manager.foundError(url, "Base Case Mode Error - Responce code came back as " + code +
            // " it should have been 200");
            // manager.workDone();
          }

        }
        // just check the responce code
        else {
          // if is not the fail code, a 404 or a 400 then we have a possible
          if (code != work.getBaseCaseObj().getFailCode()
              && code != 404
              && code != 0
              && code != 400) {
            if (work.getMethod().equalsIgnoreCase("HEAD")) {
              if (Config.debug) {
                System.out.println(
                    "DEBUG Worker[" + threadId + "]: Getting responce via GET " + url.toString());
              }
              rawResponce = "";

              httpget = new GetMethod(url.toString());
              Vector HTTPheaders = manager.getHTTPHeaders();
              for (int a = 0; a < HTTPheaders.size(); a++) {
                HTTPHeader httpHeader = (HTTPHeader) HTTPheaders.elementAt(a);
                httpget.setRequestHeader(httpHeader.getHeader(), httpHeader.getValue());
              }
              httpget.setFollowRedirects(Config.followRedirects);

              /*
               * this code is used to limit the number of request/sec
               */
              if (manager.isLimitRequests()) {
                while (manager.getTotalDone()
                        / ((System.currentTimeMillis() - manager.getTimestarted()) / 1000.0)
                    > manager.getLimitRequestsTo()) {
                  Thread.sleep(100);
                }
              }

              int newCode = httpclient.executeMethod(httpget);

              // in some cases the second get can return a different result, than the first head
              // request!
              if (newCode != code) {
                manager.foundError(
                    url,
                    "Return code for first HEAD, is different to the second GET: "
                        + code
                        + " - "
                        + newCode);
              }

              rawResponce = "";
              // build a string version of the headers
              rawResponce = httpget.getStatusLine() + "\r\n";
              Header headers[] = httpget.getResponseHeaders();

              StringBuffer buf = new StringBuffer();
              for (int a = 0; a < headers.length; a++) {
                buf.append(headers[a].getName() + ": " + headers[a].getValue() + "\r\n");
              }

              buf.append("\r\n");

              rawResponce = rawResponce + buf.toString();

              if (httpget.getResponseContentLength() > 0) {

                // get the http body
                BufferedReader input =
                    new BufferedReader(new InputStreamReader(httpget.getResponseBodyAsStream()));

                String line;

                String tempResponce = "";

                buf = new StringBuffer();
                while ((line = input.readLine()) != null) {
                  buf.append("\r\n" + line);
                }
                tempResponce = buf.toString();
                input.close();

                rawResponce = rawResponce + tempResponce;

                Header contentType = httpget.getResponseHeader("Content-Type");

                if (Config.parseHTML) {
                  contentType = httpget.getResponseHeader("Content-Type");

                  if (contentType != null) {
                    if (contentType.getValue().startsWith("text")) {
                      manager.addHTMLToParseQueue(new HTMLparseWorkUnit(tempResponce, work));
                    }
                  }
                }
              }

              httpget.releaseConnection();
            }

            if (work.isDir()) {
              manager.foundDir(url, code, rawResponce, work.getBaseCaseObj());
            } else {
              manager.foundFile(url, code, rawResponce, work.getBaseCaseObj());
            }
          }
        }

        manager.workDone();
        Thread.sleep(20);

      } catch (NoHttpResponseException e) {
        manager.foundError(url, "NoHttpResponseException " + e.getMessage());
        manager.workDone();
      } catch (ConnectTimeoutException e) {
        manager.foundError(url, "ConnectTimeoutException " + e.getMessage());
        manager.workDone();
      } catch (URIException e) {
        manager.foundError(url, "URIException " + e.getMessage());
        manager.workDone();
      } catch (IOException e) {

        manager.foundError(url, "IOException " + e.getMessage());
        manager.workDone();
      } catch (InterruptedException e) {
        // manager.foundError(url, "InterruptedException " + e.getMessage());
        manager.workDone();
        return;
      } catch (IllegalArgumentException e) {

        e.printStackTrace();
        manager.foundError(url, "IllegalArgumentException " + e.getMessage());
        manager.workDone();
      } finally {
        if (httpget != null) {
          httpget.releaseConnection();
        }

        if (httphead != null) {
          httphead.releaseConnection();
        }
      }
    }
  }