/**
 * Return the archive file type corresponding to the filename extension in the URL, or null if
 * none.
 *
 * <p>A URL ending in ".tar.gz" (compared case-insensitively) is looked up under the compound key
 * ".tar.gz". Otherwise the extension reported by {@code UrlUtil.getFileExtension} is lower-cased,
 * prefixed with "." and looked up in the map returned by {@code getExtMimeMap()}.
 *
 * @param url the URL whose filename extension is examined
 * @return the value mapped to the extension, or null if the URL has no extension or the
 *     extension has no mapping
 * @throws MalformedURLException if raised while extracting the extension from {@code url}
 */
public String getFromUrl(String url) throws MalformedURLException {
  if (StringUtil.endsWithIgnoreCase(url, ".tar.gz")) {
    return getExtMimeMap().get(".tar.gz");
  }
  String ext = UrlUtil.getFileExtension(url);
  if (ext == null) {
    // No extension: honor the documented "null if none" contract instead of throwing an NPE.
    return null;
  }
  // Locale.ROOT keeps the lookup key independent of the JVM's default locale
  // (e.g. "GIF" must become "gif", not the Turkish dotless-i form).
  return getExtMimeMap().get("." + ext.toLowerCase(java.util.Locale.ROOT));
}
/** * A method to remove any non-canonical '..' or '.' elements in the path, as well as protecting * against illegal path traversal. * * @param url the raw url * @return String the canonicalized url * @throws MalformedURLException */ public String canonicalizePath(String url) throws MalformedURLException { String canonUrl = UrlUtil.normalizeUrl(url, UrlUtil.PATH_TRAVERSAL_ACTION_THROW); // canonicalize "dir" and "dir/" // XXX if these are ever two separate nodes, this is wrong if (canonUrl.endsWith(UrlUtil.URL_PATH_SEPARATOR)) { canonUrl = canonUrl.substring(0, canonUrl.length() - 1); } return canonUrl; }
private void buildUrlSets(String url) { try { outputMessage("\nFetching " + url, TEST_SUMMARY_MESSAGE); URL srcUrl = new URL(url); // URLConnection conn = srcUrl.openConnection(); // String type = conn.getContentType(); // type = conn.getHeaderField("content-type"); // InputStream istr = conn.getInputStream(); LockssUrlConnection conn = UrlUtil.openConnection(url, connectionPool); if (proxyHost != null) { conn.setProxy(proxyHost, proxyPort); } if (userAgent != null) { conn.setRequestProperty("user-agent", userAgent); } try { conn.execute(); int resp = conn.getResponseCode(); if (resp != 200) { outputMessage("Resp: " + resp + ": " + conn.getResponseMessage(), TEST_SUMMARY_MESSAGE); return; } depth_fetched[m_curDepth - 1]++; String cookies = conn.getResponseHeaderValue("Set-Cookie"); if (cookies != null) { outputMessage("Cookies: " + cookies, PLAIN_MESSAGE); } String type = conn.getResponseContentType(); if (type == null || !type.toLowerCase().startsWith("text/html")) { outputMessage("Type: " + type + ", not parsing", URL_SUMMARY_MESSAGE); return; } outputMessage("Type: " + type + ", extracting Urls", URL_SUMMARY_MESSAGE); InputStream istr = conn.getResponseInputStream(); InputStreamReader reader = new InputStreamReader(istr); // MyMockCachedUrl mcu = new MyMockCachedUrl(srcUrl.toString(), reader); GoslingHtmlLinkExtractor extractor = new GoslingHtmlLinkExtractor(); extractor.extractUrls(null, istr, null, srcUrl.toString(), new MyLinkExtractorCallback()); istr.close(); depth_parsed[m_curDepth - 1]++; } finally { conn.release(); } } catch (MalformedURLException murle) { murle.printStackTrace(); outputErrResults(url, "Malformed URL:" + murle.getMessage()); } catch (IOException ex) { ex.printStackTrace(); outputErrResults(url, "IOException: " + ex.getMessage()); } }
// code copied from HttpClientUrlConnect.createMethod - without normalizing // both encoded and unencoded urls will be preserved public String normalizeUrl(String urlString, ArchivalUnit au) throws PluginException { String u_str = urlString; /* if the urlString is not "readable" ascii (0x1F < x < 0x7f), then * normalize/encode the string */ if (!isReadableAscii(urlString)) { if (log.isDebug2()) log.debug2("in:" + u_str); u_str = UrlUtil.encodeUri(urlString, Constants.ENCODING_UTF_8); if (log.isDebug2()) log.debug2("out:" + u_str); } return u_str; }
/**
 * Records an extracted link and classifies it as included or excluded.
 *
 * <p>The raw URL is always added to {@code m_extracted}. Its normalized form goes to
 * {@code m_incls} when the protocol is supported and {@code m_au.shouldBeCached} accepts it,
 * otherwise to {@code m_excls}. If normalization reports a malformed URL, the raw URL is added
 * to {@code m_excls} instead.
 *
 * @param url the link as found by the extractor
 */
public void foundLink(String url) {
  m_extracted.add(url);
  try {
    final String candidate = UrlUtil.normalizeUrl(url);
    final boolean wanted =
        BaseCrawler.isSupportedUrlProtocol(candidate) && m_au.shouldBeCached(candidate);
    if (!wanted) {
      m_excls.add(candidate);
    } else {
      m_incls.add(candidate);
    }
  } catch (MalformedURLException badUrl) {
    m_excls.add(url);
  }
}
/* * In order to find full text PDF you need to find the citation_pdf_url meta tag in the * abstract html pull out the pdf url normalize it (reorder params...) and find the matching * cached URL */ protected ArticleFiles processAbstract(CachedUrl absCu, Matcher absMat) { NodeList nl = null; ArticleFiles af = new ArticleFiles(); if (absCu != null && absCu.hasContent()) { // TEMPORARY: set absCU as default full text CU in case there is // no PDF CU with content; the current metadata manager currently // uses only the full text CU, but this will change with the new // metadata schema that can have multiple CUs for an article. af.setFullTextCu(absCu); af.setRoleCu(ArticleFiles.ROLE_ABSTRACT, absCu); try { InputStreamSource is = new InputStreamSource(new Stream(absCu.getUnfilteredInputStream())); Page pg = new Page(is); Lexer lx = new Lexer(pg); Parser parser = new Parser(lx); Lexer.STRICT_REMARKS = false; NodeFilter nf = new NodeFilter() { public boolean accept(Node node) { if (!(node instanceof MetaTag)) return false; MetaTag meta = (MetaTag) node; if (!"citation_pdf_url".equalsIgnoreCase(meta.getMetaTagName())) return false; return true; } }; nl = parser.extractAllNodesThatMatch(nf); } catch (ParserException e) { log.debug("Unable to parse abstract page html", e); } catch (UnsupportedEncodingException e) { log.debug("Bad encoding in abstact page html", e); } finally { absCu.release(); } } try { if (nl != null) { if (nl.size() > 0) { // minimally encode URL to prevent URL constructor // from stripping trailing spaces String pdfUrlStr = ((MetaTag) nl.elementAt(0)).getMetaContent(); URL pdfUrl = new URL(UrlUtil.minimallyEncodeUrl(pdfUrlStr)); List<String> paramList = new ArrayList<String>(); paramList.add("fileType"); paramList.add("fileId"); paramList.add("fileName"); pdfUrl = reArrangeUrlParams(pdfUrl, paramList); if (!pdfUrl.getHost().startsWith("www.")) { pdfUrl = new URL(pdfUrl.getProtocol(), "www." 
+ pdfUrl.getHost(), pdfUrl.getFile()); } // note: must leave URL encoded because that's how we store URLs CachedUrl pdfCu = au.makeCachedUrl(pdfUrl.toString()); if (pdfCu != null && pdfCu.hasContent()) { // replace absCU with pdfCU if exists and has content af.setFullTextCu(pdfCu); af.setRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF, pdfCu); } } } } catch (MalformedURLException e) { log.debug("Badly formatted pdf url link", e); } catch (IllegalArgumentException e) { log.debug("Badly formatted pdf url link", e); } return af; }
/**
 * Encodes {@code param} by delegating to {@code UrlUtil.encodeUrl}.
 *
 * @param param the value to encode
 * @return whatever {@code UrlUtil.encodeUrl} returns for {@code param}
 */
protected String urlEncode(String param) {
  final String encoded = UrlUtil.encodeUrl(param);
  return encoded;
}
/** Common request handling. */ public void service(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { resetState(); boolean success = false; HttpSession session = req.getSession(false); try { this.req = req; this.resp = resp; if (log.isDebug()) { logParams(); } resp.setContentType("text/html"); if (!mayPageBeCached()) { resp.setHeader("pragma", "no-cache"); resp.setHeader("Cache-control", "no-cache"); } reqURL = new URL(UrlUtil.getRequestURL(req)); clientAddr = getLocalIPAddr(); // check that current user has permission to run this servlet if (!isServletAllowed(myServletDescr())) { displayWarningInLieuOfPage("You are not authorized to use " + myServletDescr().heading); return; } // check whether servlet is disabled String reason = ServletUtil.servletDisabledReason(myServletDescr().getServletName()); if (reason != null) { displayWarningInLieuOfPage("This function is disabled. " + reason); return; } if (session != null) { session.setAttribute(SESSION_KEY_RUNNING_SERVLET, getHeading()); String reqHost = req.getRemoteHost(); String forw = req.getHeader(HttpFields.__XForwardedFor); if (!StringUtil.isNullString(forw)) { reqHost += " (proxies for " + forw + ")"; } session.setAttribute(SESSION_KEY_REQUEST_HOST, reqHost); } lockssHandleRequest(); success = (errMsg == null); } catch (ServletException e) { log.error("Servlet threw", e); throw e; } catch (IOException e) { log.error("Servlet threw", e); throw e; } catch (RuntimeException e) { log.error("Servlet threw", e); throw e; } finally { if (session != null) { session.setAttribute(SESSION_KEY_RUNNING_SERVLET, null); session.setAttribute(LockssFormAuthenticator.__J_AUTH_ACTIVITY, TimeBase.nowMs()); } if ("please".equalsIgnoreCase(req.getHeader("X-Lockss-Result"))) { log.debug3("X-Lockss-Result: " + (success ? "Ok" : "Fail")); resp.setHeader("X-Lockss-Result", success ? "Ok" : "Fail"); } resetMyLocals(); resetLocals(); } }