/**
  * Return the archive file type corresponding to the filename extension in the URL, or null if
  * none.
  */
 public String getFromUrl(String url) throws MalformedURLException {
   if (StringUtil.endsWithIgnoreCase(url, ".tar.gz")) {
     return getExtMimeMap().get(".tar.gz");
   }
   String ext = UrlUtil.getFileExtension(url).toLowerCase();
   return getExtMimeMap().get("." + ext);
 }
  /**
   * A method to remove any non-canonical '..' or '.' elements in the path, as well as protecting
   * against illegal path traversal.
   *
   * @param url the raw url
   * @return String the canonicalized url
   * @throws MalformedURLException
   */
  public String canonicalizePath(String url) throws MalformedURLException {
    String canonUrl = UrlUtil.normalizeUrl(url, UrlUtil.PATH_TRAVERSAL_ACTION_THROW);
    // canonicalize "dir" and "dir/"
    // XXX if these are ever two separate nodes, this is wrong
    if (canonUrl.endsWith(UrlUtil.URL_PATH_SEPARATOR)) {
      canonUrl = canonUrl.substring(0, canonUrl.length() - 1);
    }

    return canonUrl;
  }
示例#3
0
  private void buildUrlSets(String url) {

    try {
      outputMessage("\nFetching " + url, TEST_SUMMARY_MESSAGE);
      URL srcUrl = new URL(url);
      //       URLConnection conn = srcUrl.openConnection();
      //       String type = conn.getContentType();
      //       type = conn.getHeaderField("content-type");
      //       InputStream istr = conn.getInputStream();

      LockssUrlConnection conn = UrlUtil.openConnection(url, connectionPool);
      if (proxyHost != null) {
        conn.setProxy(proxyHost, proxyPort);
      }
      if (userAgent != null) {
        conn.setRequestProperty("user-agent", userAgent);
      }
      try {
        conn.execute();
        int resp = conn.getResponseCode();
        if (resp != 200) {
          outputMessage("Resp: " + resp + ": " + conn.getResponseMessage(), TEST_SUMMARY_MESSAGE);
          return;
        }
        depth_fetched[m_curDepth - 1]++;
        String cookies = conn.getResponseHeaderValue("Set-Cookie");
        if (cookies != null) {
          outputMessage("Cookies: " + cookies, PLAIN_MESSAGE);
        }
        String type = conn.getResponseContentType();
        if (type == null || !type.toLowerCase().startsWith("text/html")) {
          outputMessage("Type: " + type + ", not parsing", URL_SUMMARY_MESSAGE);
          return;
        }
        outputMessage("Type: " + type + ", extracting Urls", URL_SUMMARY_MESSAGE);
        InputStream istr = conn.getResponseInputStream();
        InputStreamReader reader = new InputStreamReader(istr);
        //       MyMockCachedUrl mcu = new MyMockCachedUrl(srcUrl.toString(), reader);
        GoslingHtmlLinkExtractor extractor = new GoslingHtmlLinkExtractor();
        extractor.extractUrls(null, istr, null, srcUrl.toString(), new MyLinkExtractorCallback());
        istr.close();
        depth_parsed[m_curDepth - 1]++;
      } finally {
        conn.release();
      }
    } catch (MalformedURLException murle) {
      murle.printStackTrace();
      outputErrResults(url, "Malformed URL:" + murle.getMessage());
    } catch (IOException ex) {
      ex.printStackTrace();
      outputErrResults(url, "IOException: " + ex.getMessage());
    }
  }
 // code copied from HttpClientUrlConnect.createMethod - without normalizing
 // both encoded and unencoded urls will be preserved
 public String normalizeUrl(String urlString, ArchivalUnit au) throws PluginException {
   String u_str = urlString;
   /* if the urlString is not "readable" ascii (0x1F < x < 0x7f), then
    * normalize/encode the string
    */
   if (!isReadableAscii(urlString)) {
     if (log.isDebug2()) log.debug2("in:" + u_str);
     u_str = UrlUtil.encodeUri(urlString, Constants.ENCODING_UTF_8);
     if (log.isDebug2()) log.debug2("out:" + u_str);
   }
   return u_str;
 }
示例#5
0
    public void foundLink(String url) {

      m_extracted.add(url);
      try {
        String normUrl = UrlUtil.normalizeUrl(url);
        if (BaseCrawler.isSupportedUrlProtocol(normUrl) && m_au.shouldBeCached(normUrl)) {
          m_incls.add(normUrl);
        } else {
          m_excls.add(normUrl);
        }
      } catch (MalformedURLException e) {
        m_excls.add(url);
      }
    }
    /*
     * In order to find full text PDF you need to find the citation_pdf_url meta tag in the
     * abstract html pull out the pdf url normalize it (reorder params...) and find the matching
     * cached URL
     */
    protected ArticleFiles processAbstract(CachedUrl absCu, Matcher absMat) {
      NodeList nl = null;
      ArticleFiles af = new ArticleFiles();
      if (absCu != null && absCu.hasContent()) {
        // TEMPORARY: set absCU as default full text CU in case there is
        // no PDF CU with content; the current metadata manager currently
        // uses only the full text CU, but this will change with the new
        // metadata schema that can have multiple CUs for an article.
        af.setFullTextCu(absCu);
        af.setRoleCu(ArticleFiles.ROLE_ABSTRACT, absCu);
        try {
          InputStreamSource is =
              new InputStreamSource(new Stream(absCu.getUnfilteredInputStream()));
          Page pg = new Page(is);
          Lexer lx = new Lexer(pg);
          Parser parser = new Parser(lx);
          Lexer.STRICT_REMARKS = false;
          NodeFilter nf =
              new NodeFilter() {
                public boolean accept(Node node) {
                  if (!(node instanceof MetaTag)) return false;
                  MetaTag meta = (MetaTag) node;
                  if (!"citation_pdf_url".equalsIgnoreCase(meta.getMetaTagName())) return false;
                  return true;
                }
              };
          nl = parser.extractAllNodesThatMatch(nf);
        } catch (ParserException e) {
          log.debug("Unable to parse abstract page html", e);
        } catch (UnsupportedEncodingException e) {
          log.debug("Bad encoding in abstact page html", e);
        } finally {
          absCu.release();
        }
      }
      try {
        if (nl != null) {
          if (nl.size() > 0) {
            // minimally encode URL to prevent URL constructor
            // from stripping trailing spaces
            String pdfUrlStr = ((MetaTag) nl.elementAt(0)).getMetaContent();
            URL pdfUrl = new URL(UrlUtil.minimallyEncodeUrl(pdfUrlStr));
            List<String> paramList = new ArrayList<String>();
            paramList.add("fileType");
            paramList.add("fileId");
            paramList.add("fileName");
            pdfUrl = reArrangeUrlParams(pdfUrl, paramList);

            if (!pdfUrl.getHost().startsWith("www.")) {
              pdfUrl = new URL(pdfUrl.getProtocol(), "www." + pdfUrl.getHost(), pdfUrl.getFile());
            }

            // note: must leave URL encoded because that's how we store URLs
            CachedUrl pdfCu = au.makeCachedUrl(pdfUrl.toString());
            if (pdfCu != null && pdfCu.hasContent()) {
              // replace absCU with pdfCU if exists and has content
              af.setFullTextCu(pdfCu);
              af.setRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF, pdfCu);
            }
          }
        }
      } catch (MalformedURLException e) {
        log.debug("Badly formatted pdf url link", e);
      } catch (IllegalArgumentException e) {
        log.debug("Badly formatted pdf url link", e);
      }

      return af;
    }
示例#7
0
 protected String urlEncode(String param) {
   return UrlUtil.encodeUrl(param);
 }
示例#8
0
  /** Common request handling. */
  public void service(HttpServletRequest req, HttpServletResponse resp)
      throws ServletException, IOException {
    resetState();
    boolean success = false;
    HttpSession session = req.getSession(false);
    try {
      this.req = req;
      this.resp = resp;
      if (log.isDebug()) {
        logParams();
      }
      resp.setContentType("text/html");

      if (!mayPageBeCached()) {
        resp.setHeader("pragma", "no-cache");
        resp.setHeader("Cache-control", "no-cache");
      }

      reqURL = new URL(UrlUtil.getRequestURL(req));
      clientAddr = getLocalIPAddr();

      // check that current user has permission to run this servlet
      if (!isServletAllowed(myServletDescr())) {
        displayWarningInLieuOfPage("You are not authorized to use " + myServletDescr().heading);
        return;
      }

      // check whether servlet is disabled
      String reason = ServletUtil.servletDisabledReason(myServletDescr().getServletName());
      if (reason != null) {
        displayWarningInLieuOfPage("This function is disabled. " + reason);
        return;
      }
      if (session != null) {
        session.setAttribute(SESSION_KEY_RUNNING_SERVLET, getHeading());
        String reqHost = req.getRemoteHost();
        String forw = req.getHeader(HttpFields.__XForwardedFor);
        if (!StringUtil.isNullString(forw)) {
          reqHost += " (proxies for " + forw + ")";
        }
        session.setAttribute(SESSION_KEY_REQUEST_HOST, reqHost);
      }
      lockssHandleRequest();
      success = (errMsg == null);
    } catch (ServletException e) {
      log.error("Servlet threw", e);
      throw e;
    } catch (IOException e) {
      log.error("Servlet threw", e);
      throw e;
    } catch (RuntimeException e) {
      log.error("Servlet threw", e);
      throw e;
    } finally {
      if (session != null) {
        session.setAttribute(SESSION_KEY_RUNNING_SERVLET, null);
        session.setAttribute(LockssFormAuthenticator.__J_AUTH_ACTIVITY, TimeBase.nowMs());
      }
      if ("please".equalsIgnoreCase(req.getHeader("X-Lockss-Result"))) {
        log.debug3("X-Lockss-Result: " + (success ? "Ok" : "Fail"));
        resp.setHeader("X-Lockss-Result", success ? "Ok" : "Fail");
      }
      resetMyLocals();
      resetLocals();
    }
  }