Exemple #1
0
  /**
   * Reports whether this Linted Page carries a complete set of basic meta data. The selection is
   * somewhat arbitrary: a title, a description, and a preview image URL must each be present and
   * non-empty.
   *
   * <p>TODO: Consider relocating to a more consumer-specific location
   *
   * @return true if all three basic meta data entries are present and non-empty
   */
  public boolean hasBasicMetaDataSet() {
    final LintedData metaData = getMetaData();

    // Checked in this order; the first missing or empty entry decides the answer.
    for (String requiredKey : new String[] {"title", "description", "preview_image_url"}) {
      if (!metaData.hasKey(requiredKey) || metaData.getString(requiredKey).isEmpty()) {
        return false;
      }
    }
    return true;
  }
Exemple #2
0
  /**
   * Scrapes the metadata on this page (can be called separately from {@code process}).
   *
   * <p>Opens a direct (no proxy) HTTP connection to the destination URL and inspects the response
   * headers before reading the body:
   *
   * <ul>
   *   <li>If the content type is neither {@code text/html} nor {@code text/plain}, the page is not
   *       scraped. For {@code image/png} and {@code image/jpeg} the destination URL itself is
   *       stored as {@code preview_image_url} and the parse is flagged as successful.
   *   <li>If the advertised content length exceeds {@code HTTP_MAX_CONTENT_LENGTH}, the page is not
   *       scraped.
   *   <li>Otherwise the (optionally gzip/deflate encoded) body is handed to the {@code
   *       ServiceParser} selected for the destination URL.
   * </ul>
   *
   * <p>Download failures are not thrown; they are logged and recorded in {@code _parseError}. After
   * a parse, {@code _parseOk}, {@code _metaData}, and, when the parser supplies them, {@code
   * _destinationUrl}, {@code _aliases} and {@code _parseError} are updated.
   */
  public void scrapeMetadata() {
    final String logPrefix = "[" + this.getDestinationUrl() + "] ";

    logger.trace(logPrefix + "Downloading and scraping page contents...");

    InputStream inStr = null;
    HttpURLConnection connection = null;
    try {
      URL url = new URL(this.getDestinationUrl());
      connection = (HttpURLConnection) url.openConnection(Proxy.NO_PROXY);
      connection.setConnectTimeout(LintedPage.HTTP_CONNECT_TIMEOUT);
      connection.setReadTimeout(LintedPage.HTTP_READ_TIMEOUT);
      connection.setRequestProperty("User-Agent", LintedPage.HTTP_USER_AGENT);
      connection.setRequestProperty("Accept-Encoding", "gzip, deflate");
      connection.setRequestProperty(
          "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");

      String contentType = connection.getContentType();
      if (contentType == null) {
        contentType = "unknown";
      }
      // Locale.ROOT keeps the comparison independent of the JVM's default locale (e.g. the
      // Turkish dotless-i mapping would otherwise break matching of "IMAGE/PNG").
      final String contentTypeLower = contentType.toLowerCase(java.util.Locale.ROOT);
      if (!contentTypeLower.contains("text/html") && !contentTypeLower.contains("text/plain")) {

        if (contentTypeLower.contains("image/png") || contentTypeLower.contains("image/jpeg")) {
          // The page *is* an image, so it serves as its own preview.
          getMetaData().put("preview_image_url", this.getDestinationUrl());
          _parseOk = true;
        }

        logger.warn(
            logPrefix
                + "Not downloading or scraping page because content-type was: "
                + contentType);
        // The body is never read on this path, so release the connection explicitly.
        connection.disconnect();
        return;
      }

      int contentLength = connection.getContentLength();
      if (contentLength > LintedPage.HTTP_MAX_CONTENT_LENGTH) {
        logger.warn(
            logPrefix
                + "Not downloading or scraping page because content-length was too large: "
                + Integer.toString(contentLength)
                + " (max: "
                + Integer.toString(LintedPage.HTTP_MAX_CONTENT_LENGTH)
                + ")");
        // The body is never read on this path, so release the connection explicitly.
        connection.disconnect();
        return;
      }

      String encoding = connection.getContentEncoding();
      if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
        inStr = new GZIPInputStream(connection.getInputStream());
      } else if (encoding != null && encoding.equalsIgnoreCase("deflate")) {
        inStr = new InflaterInputStream(connection.getInputStream(), new Inflater(true));
      } else {
        inStr = connection.getInputStream();
      }
    } catch (FileNotFoundException fnf) {
      _parseError = "HTTP ERROR 404";
      logger.error(logPrefix + " " + _parseError);
      return;
    } catch (IOException ioe) {
      // Default message, used when no response code can be obtained.
      _parseError = " Unable to download page: " + ioe;
      // connection is still null when the URL itself was malformed (MalformedURLException is an
      // IOException thrown before the connection is opened); guard against dereferencing it.
      if (connection != null) {
        try {
          _parseError =
              "Unable to download page [HTTP ERROR "
                  + Integer.toString(connection.getResponseCode())
                  + "]: "
                  + ioe;
        } catch (IOException e) {
          // We'd get an ioexception on the above try{} clause if don't have a response code;
          // the default message assigned above is kept in that case.
        }
      }
      logger.error(logPrefix + " " + _parseError);
      return;
    } catch (Exception ex) {
      logger.error(logPrefix + "Unable to download page: " + ex);
      _parseError = ex.toString();
      return;
    }

    try {
      ServiceParser parser =
          ServiceParserChainManager.getInstance().getServiceParser(this.getDestinationUrl());
      parser.setRawContent(inStr);
      parser.setRedirectUrlList(_redirectUrlList);
      _parseOk = parser.parse();
      _metaData = parser.getMetaData();

      // Update the URL, if modified by the ServiceParser
      String url = _metaData.getString("url");
      if (url != null && !url.isEmpty()) {
        _destinationUrl = url;
      }

      // Update alias URLs, if modified by the ServiceParser
      if (_metaData.get("alias_urls") != null) {
        Object[] arr = (Object[]) _metaData.get("alias_urls");
        _aliases =
            new ArrayList<String>(Arrays.asList(Arrays.copyOf(arr, arr.length, String[].class)));
      }

      // Get any parse error from the ServiceParser
      String parseError = parser.getParseError();
      if (parseError != null && !parseError.isEmpty()) {
        _parseError = parseError;
      }
    } finally {
      // Release the response stream once the parser has been fully consulted.
      // NOTE(review): assumes the parser does not keep reading the stream after this method
      // returns - confirm against ServiceParser implementations.
      if (inStr != null) {
        try {
          inStr.close();
        } catch (IOException ignored) {
          // Best-effort cleanup; a failure to close must not mask the parse outcome.
        }
      }
    }
  }