/**
 * Identifies whether the Linted Page has a complete set of basic meta data. This is somewhat
 * arbitrary, but includes Title, Description, and Preview Image URL.
 *
 * <p>TODO: Consider relocating to a more consumer-specific location
 *
 * @return true if the complete set is present
 */
public boolean hasBasicMetaDataSet() {
  LintedData data = getMetaData();
  return data.hasKey("title")
      && !data.getString("title").isEmpty()
      && data.hasKey("description")
      && !data.getString("description").isEmpty()
      && data.hasKey("preview_image_url")
      && !data.getString("preview_image_url").isEmpty();
}
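// Hedged usage sketch (not part of this class's API surface): how a consumer might gate
// rich-preview rendering on the basic meta data check. Obtaining the LintedPage instance and the
// renderRichPreview(...) helper are assumptions; only methods defined in this class are called.
//
//   LintedPage page = ...; // a page that has already been processed/scraped
//   if (page.getParseOk() && page.hasBasicMetaDataSet()) {
//     LintedData data = page.getMetaData();
//     renderRichPreview(
//         data.getString("title"),
//         data.getString("description"),
//         data.getString("preview_image_url"));
//   }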
/**
 * Output Linter status and meta data as a human-readable string.
 *
 * @return Linter status and meta data
 */
public String toDebugString() {
  StringBuilder sb = new StringBuilder(_originalUrl);
  sb.append(" {\n");
  sb.append("\tPARSE OK:\t\t");
  sb.append(this.getParseOk());
  sb.append('\n');
  if (!this.getParseOk()) {
    sb.append("\tPARSE ERROR:\t\t");
    sb.append(this.getParseError());
    sb.append('\n');
  }
  sb.append("\tALIASES:");
  if (_aliases == null || _aliases.isEmpty()) {
    sb.append("\t\tNONE\n");
  } else {
    sb.append('\n');
    for (String alias : _aliases) {
      sb.append("\t\t");
      sb.append(alias);
      sb.append('\n');
    }
  }
  sb.append("\tDEST URL:\t\t");
  sb.append(this.getDestinationUrl());
  sb.append('\n');
  if (_metaData != null) {
    sb.append(_metaData.getPrettyDebugString());
  } else {
    sb.append("\tNo meta data parsed.\n");
  }
  sb.append("} in ");
  sb.append(this.getProcessingTimeForHumans());
  sb.append(" s\n");
  return sb.toString();
}
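// Illustrative output shape only (URLs and timing are made up); derived from the appends above,
// with tabs shown as spaces for readability. The meta data block comes from
// LintedData.getPrettyDebugString().
//
//   http://bit.ly/example {
//       PARSE OK:        true
//       ALIASES:
//           http://example.com/alias
//       DEST URL:        http://example.com/article
//       <pretty-printed meta data>
//   } in 0.42 s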
/** Scrapes the metadata on this page (can be called separately from {@link #process}). */
public void scrapeMetadata() {
  final String logPrefix = "[" + this.getDestinationUrl() + "] ";
  logger.trace(logPrefix + "Downloading and scraping page contents...");

  InputStream inStr = null;
  HttpURLConnection connection = null;
  try {
    URL url = new URL(this.getDestinationUrl());
    connection = (HttpURLConnection) url.openConnection(Proxy.NO_PROXY);
    connection.setConnectTimeout(LintedPage.HTTP_CONNECT_TIMEOUT);
    connection.setReadTimeout(LintedPage.HTTP_READ_TIMEOUT);
    connection.setRequestProperty("User-Agent", LintedPage.HTTP_USER_AGENT);
    connection.setRequestProperty("Accept-Encoding", "gzip, deflate");
    connection.setRequestProperty(
        "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");

    // Only HTML and plain text are scraped; direct image links still get a preview image.
    String contentType = connection.getContentType();
    if (contentType == null) {
      contentType = "unknown";
    }
    if (!contentType.toLowerCase().contains("text/html")
        && !contentType.toLowerCase().contains("text/plain")) {
      if (contentType.toLowerCase().contains("image/png")
          || contentType.toLowerCase().contains("image/jpeg")) {
        getMetaData().put("preview_image_url", this.getDestinationUrl());
        _parseOk = true;
      }
      logger.warn(
          logPrefix + "Not downloading or scraping page because content-type was: " + contentType);
      return;
    }

    int contentLength = connection.getContentLength();
    if (contentLength > LintedPage.HTTP_MAX_CONTENT_LENGTH) {
      logger.warn(
          logPrefix
              + "Not downloading or scraping page because content-length was too large: "
              + contentLength
              + " (max: "
              + LintedPage.HTTP_MAX_CONTENT_LENGTH
              + ")");
      return;
    }

    // Wrap the response stream to match the Content-Encoding header; Inflater(true) handles
    // raw (header-less) deflate streams.
    String encoding = connection.getContentEncoding();
    if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
      inStr = new GZIPInputStream(connection.getInputStream());
    } else if (encoding != null && encoding.equalsIgnoreCase("deflate")) {
      inStr = new InflaterInputStream(connection.getInputStream(), new Inflater(true));
    } else {
      inStr = connection.getInputStream();
    }
  } catch (FileNotFoundException fnf) {
    _parseError = "HTTP ERROR 404";
    logger.error(logPrefix + _parseError);
    return;
  } catch (IOException ioe) {
    try {
      // connection can still be null here if the URL itself was malformed
      if (connection != null) {
        _parseError =
            "Unable to download page [HTTP ERROR " + connection.getResponseCode() + "]: " + ioe;
      } else {
        _parseError = "Unable to download page: " + ioe;
      }
    } catch (IOException e) {
      // getResponseCode() itself throws when no response code is available
      _parseError = "Unable to download page: " + ioe;
    }
    logger.error(logPrefix + _parseError);
    return;
  } catch (Exception ex) {
    logger.error(logPrefix + "Unable to download page: " + ex);
    _parseError = ex.toString();
    return;
  }

  ServiceParser parser =
      ServiceParserChainManager.getInstance().getServiceParser(this.getDestinationUrl());
  parser.setRawContent(inStr);
  parser.setRedirectUrlList(_redirectUrlList);
  _parseOk = parser.parse();
  _metaData = parser.getMetaData();

  // Update the URL, if modified by the ServiceParser
  String url = _metaData.getString("url");
  if (url != null && !url.isEmpty()) {
    _destinationUrl = url;
  }

  // Update alias URLs, if modified by the ServiceParser
  if (_metaData.get("alias_urls") != null) {
    Object[] arr = (Object[]) _metaData.get("alias_urls");
    _aliases = new ArrayList<>(Arrays.asList(Arrays.copyOf(arr, arr.length, String[].class)));
  }

  // Get any parse error from the ServiceParser
  String parseError = parser.getParseError();
  if (parseError != null && !parseError.isEmpty()) {
    _parseError = parseError;
  }
}
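// Hedged standalone-usage sketch: scrapeMetadata() can be driven outside the full process(...)
// flow, e.g. to re-scrape a page whose destination URL is already resolved. Construction of the
// LintedPage is an assumption; the accessors used are the ones defined in this class.
//
//   LintedPage page = ...; // destination URL already resolved
//   page.scrapeMetadata();
//   if (!page.getParseOk()) {
//     logger.warn("Scrape failed: " + page.getParseError());
//   }
//   logger.debug(page.toDebugString());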