Exemplo n.º 1
0
 /** isEmpty is true before add, false after */
 public void testEmpty() {
   NavigableSet q = set0();
   assertTrue(q.isEmpty());
   assertTrue(q.add(new Integer(1)));
   assertFalse(q.isEmpty());
   assertTrue(q.add(new Integer(2)));
   q.pollFirst();
   q.pollFirst();
   assertTrue(q.isEmpty());
 }
Exemplo n.º 2
0
 /** clear removes all elements */
 public void testDescendingClear() {
   NavigableSet q = populatedSet(SIZE);
   q.clear();
   assertTrue(q.isEmpty());
   assertEquals(0, q.size());
   assertTrue(q.add(new Integer(1)));
   assertFalse(q.isEmpty());
   q.clear();
   assertTrue(q.isEmpty());
 }
Exemplo n.º 3
0
  /** A deserialized serialized set has same elements */
  public void testSerialization() throws Exception {
    NavigableSet x = populatedSet(SIZE);
    NavigableSet y = serialClone(x);

    assertNotSame(x, y);
    assertEquals(x.size(), y.size());
    assertEquals(x, y);
    assertEquals(y, x);
    while (!x.isEmpty()) {
      assertFalse(y.isEmpty());
      assertEquals(x.pollFirst(), y.pollFirst());
    }
    assertTrue(y.isEmpty());
  }
Exemplo n.º 4
0
  /** A deserialized serialized set has same elements */
  public void testDescendingSerialization() throws Exception {
    NavigableSet x = dset5();
    NavigableSet y = serialClone(x);

    assertNotSame(x, y);
    assertEquals(x.size(), y.size());
    assertEquals(x.toString(), y.toString());
    assertEquals(x, y);
    assertEquals(y, x);
    while (!x.isEmpty()) {
      assertFalse(y.isEmpty());
      assertEquals(x.pollFirst(), y.pollFirst());
    }
    assertTrue(y.isEmpty());
  }
Exemplo n.º 5
0
  @Test
  public void WriteDBInt_lastKey_set_middle() {
    int numberOfRecords = 1000;

    /* Creates connections to MapDB */
    DB db1 = DBMaker.memoryDB().transactionDisable().make();

    /* Creates maps */
    NavigableSet<Integer> map1 = db1.treeSet("column1");

    /* Inserts initial values in maps */
    for (int i = 0; i < numberOfRecords; i++) {
      map1.add(i);
    }

    assertEquals((Object) (numberOfRecords - 1), map1.last());

    map1.clear();

    /* Inserts some values in maps */
    for (int i = 100; i < 110; i++) {
      map1.add(i);
    }

    assertEquals(10, map1.size());
    assertFalse(map1.isEmpty());
    assertEquals((Object) 109, map1.last());
    assertEquals((Object) 100, map1.first());
  }
Exemplo n.º 6
0
  @Override
  protected T prefetch() throws Exception {
    while (buffer.size() < maxBufferSize && it.hasNext()) {
      buffer.add(it.next());
    }

    return buffer.isEmpty() ? finish() : buffer.pollFirst();
  }
Exemplo n.º 7
0
 /** remove(x) removes x and returns true if present */
 public void testDescendingRemoveElement() {
   NavigableSet q = populatedSet(SIZE);
   for (int i = 1; i < SIZE; i += 2) {
     assertTrue(q.remove(new Integer(i)));
   }
   for (int i = 0; i < SIZE; i += 2) {
     assertTrue(q.remove(new Integer(i)));
     assertFalse(q.remove(new Integer(i + 1)));
   }
   assertTrue(q.isEmpty());
 }
 public void removeInterval(
     String stationName, int start, int end) { // TODO remove empty streams and stations
   NavigableSet<String> sensorNames = getSensorNames(stationName);
   if (sensorNames.isEmpty()) {
     log.info("no sensors in station -> nothing removed: " + stationName);
     return;
   }
   for (String sensorName : sensorNames) {
     log.trace("remove " + stationName + "/" + sensorName + "  " + start + "  " + end);
     removeSensorData(stationName, sensorName, start, end);
   }
 }
Exemplo n.º 9
0
  /** Returns a new set of given size containing consecutive Integers 0 ... n. */
  private NavigableSet<Integer> populatedSet(int n) {
    TreeSet<Integer> q = new TreeSet<Integer>();
    assertTrue(q.isEmpty());

    for (int i = n - 1; i >= 0; i -= 2) assertTrue(q.add(new Integer(i)));
    for (int i = (n & 1); i < n; i += 2) assertTrue(q.add(new Integer(i)));
    assertTrue(q.add(new Integer(-n)));
    assertTrue(q.add(new Integer(n)));
    NavigableSet s = q.subSet(new Integer(0), true, new Integer(n), false);
    assertFalse(s.isEmpty());
    assertEquals(n, s.size());
    return s;
  }
Exemplo n.º 10
0
 /** remove(x) removes x and returns true if present */
 public void testRemoveElement() {
   NavigableSet q = populatedSet(SIZE);
   for (int i = 1; i < SIZE; i += 2) {
     assertTrue(q.contains(i));
     assertTrue(q.remove(i));
     assertFalse(q.contains(i));
     assertTrue(q.contains(i - 1));
   }
   for (int i = 0; i < SIZE; i += 2) {
     assertTrue(q.contains(i));
     assertTrue(q.remove(i));
     assertFalse(q.contains(i));
     assertFalse(q.remove(i + 1));
     assertFalse(q.contains(i + 1));
   }
   assertTrue(q.isEmpty());
 }
Exemplo n.º 11
0
 /**
  * Transactional version of {@link HTable#get(Get)}
  *
  * @param transactionState Identifier of the transaction
  * @see HTable#get(Get)
  * @throws IOException
  */
 public Result get(TransactionState transactionState, final Get get) throws IOException {
   final long readTimestamp = transactionState.getStartTimestamp();
   final Get tsget = new Get(get.getRow());
   TimeRange timeRange = get.getTimeRange();
   long startTime = timeRange.getMin();
   long endTime = Math.min(timeRange.getMax(), readTimestamp + 1);
   //      int maxVersions = get.getMaxVersions();
   tsget
       .setTimeRange(startTime, endTime)
       .setMaxVersions((int) (versionsAvg + CACHE_VERSIONS_OVERHEAD));
   Map<byte[], NavigableSet<byte[]>> kvs = get.getFamilyMap();
   for (Map.Entry<byte[], NavigableSet<byte[]>> entry : kvs.entrySet()) {
     byte[] family = entry.getKey();
     NavigableSet<byte[]> qualifiers = entry.getValue();
     if (qualifiers == null || qualifiers.isEmpty()) {
       tsget.addFamily(family);
     } else {
       for (byte[] qualifier : qualifiers) {
         tsget.addColumn(family, qualifier);
       }
     }
   }
   //      Result result;
   //      Result filteredResult;
   //      do {
   //         result = super.get(tsget);
   //         filteredResult = filter(super.get(tsget), readTimestamp, maxVersions);
   //      } while (!result.isEmpty() && filteredResult == null);
   getsPerformed++;
   Result result =
       filter(
           transactionState,
           super.get(tsget),
           readTimestamp,
           (int) (versionsAvg + CACHE_VERSIONS_OVERHEAD));
   return result == null ? new Result() : result;
   //      Scan scan = new Scan(get);
   //      scan.setRetainDeletesInOutput(true);
   //      ResultScanner rs = this.getScanner(transactionState, scan);
   //      Result r = rs.next();
   //      if (r == null) {
   //         r = new Result();
   //      }
   //      return r;
 }
 /*
  * Check if specified KeyValue buffer has been deleted by a previously
  * seen delete.
  * @param kv
  * @return true is the specified KeyValue is deleted, false if not
  */
 private boolean isDeleted(final KeyValue kv) {
   if (this.deletes.isEmpty()) return false;
   NavigableSet<KeyValue> rowdeletes = this.deletes.get(kv);
   if (rowdeletes == null || rowdeletes.isEmpty()) return false;
   return isDeleted(kv, rowdeletes);
 }
Exemplo n.º 13
0
 public boolean isEmpty() {
   return ackedMsgs.isEmpty();
 }
Exemplo n.º 14
0
    RandomSelection select(boolean narrow, boolean mixInNotPresentItems, boolean permitReversal) {
      ThreadLocalRandom random = ThreadLocalRandom.current();
      NavigableSet<Integer> canonicalSet = this.canonical;
      BTreeSet<Integer> testAsSet = this.test;
      List<Integer> canonicalList = new ArrayList<>(canonicalSet);
      BTreeSet<Integer> testAsList = this.test;

      Assert.assertEquals(canonicalSet.size(), testAsSet.size());
      Assert.assertEquals(canonicalList.size(), testAsList.size());

      // sometimes select keys first, so we cover full range
      List<Integer> allKeys = randomKeys(canonical, mixInNotPresentItems);
      List<Integer> keys = allKeys;

      int narrowCount = random.nextInt(3);
      while (narrow && canonicalList.size() > 10 && keys.size() > 10 && narrowCount-- > 0) {
        boolean useLb = random.nextBoolean();
        boolean useUb = random.nextBoolean();
        if (!(useLb | useUb)) continue;

        // select a range smaller than the total span when we have more narrowing iterations left
        int indexRange = keys.size() / (narrowCount + 1);

        boolean lbInclusive = true;
        Integer lbKey = canonicalList.get(0);
        int lbKeyIndex = 0, lbIndex = 0;
        boolean ubInclusive = true;
        Integer ubKey = canonicalList.get(canonicalList.size() - 1);
        int ubKeyIndex = keys.size(), ubIndex = canonicalList.size();

        if (useLb) {
          lbKeyIndex = random.nextInt(0, indexRange - 1);
          Integer candidate = keys.get(lbKeyIndex);
          if (useLb = (candidate > lbKey && candidate <= ubKey)) {
            lbInclusive = random.nextBoolean();
            lbKey = keys.get(lbKeyIndex);
            lbIndex = Collections.binarySearch(canonicalList, lbKey);
            if (lbIndex >= 0 && !lbInclusive) lbIndex++;
            else if (lbIndex < 0) lbIndex = -1 - lbIndex;
          }
        }
        if (useUb) {
          ubKeyIndex =
              random.nextInt(Math.max(lbKeyIndex, keys.size() - indexRange), keys.size() - 1);
          Integer candidate = keys.get(ubKeyIndex);
          if (useUb = (candidate < ubKey && candidate >= lbKey)) {
            ubInclusive = random.nextBoolean();
            ubKey = keys.get(ubKeyIndex);
            ubIndex = Collections.binarySearch(canonicalList, ubKey);
            if (ubIndex >= 0 && ubInclusive) {
              ubIndex++;
            } else if (ubIndex < 0) ubIndex = -1 - ubIndex;
          }
        }
        if (ubIndex < lbIndex) {
          ubIndex = lbIndex;
          ubKey = lbKey;
          ubInclusive = false;
        }

        canonicalSet =
            !useLb
                ? canonicalSet.headSet(ubKey, ubInclusive)
                : !useUb
                    ? canonicalSet.tailSet(lbKey, lbInclusive)
                    : canonicalSet.subSet(lbKey, lbInclusive, ubKey, ubInclusive);
        testAsSet =
            !useLb
                ? testAsSet.headSet(ubKey, ubInclusive)
                : !useUb
                    ? testAsSet.tailSet(lbKey, lbInclusive)
                    : testAsSet.subSet(lbKey, lbInclusive, ubKey, ubInclusive);

        keys = keys.subList(lbKeyIndex, ubKeyIndex);
        canonicalList = canonicalList.subList(lbIndex, ubIndex);
        testAsList = testAsList.subList(lbIndex, ubIndex);

        Assert.assertEquals(canonicalSet.size(), testAsSet.size());
        Assert.assertEquals(canonicalList.size(), testAsList.size());
      }

      // possibly restore full set of keys, to test case where we are provided existing keys that
      // are out of bounds
      if (keys != allKeys && random.nextBoolean()) keys = allKeys;

      Comparator<Integer> comparator = naturalOrder();
      if (permitReversal && random.nextBoolean()) {
        if (allKeys != keys) keys = new ArrayList<>(keys);
        if (canonicalSet != canonical) canonicalList = new ArrayList<>(canonicalList);
        Collections.reverse(keys);
        Collections.reverse(canonicalList);
        testAsList = testAsList.descendingSet();

        canonicalSet = canonicalSet.descendingSet();
        testAsSet = testAsSet.descendingSet();
        comparator = reverseOrder();
      }

      Assert.assertEquals(canonicalSet.size(), testAsSet.size());
      Assert.assertEquals(canonicalList.size(), testAsList.size());
      if (!canonicalSet.isEmpty()) {
        Assert.assertEquals(canonicalSet.first(), canonicalList.get(0));
        Assert.assertEquals(canonicalSet.last(), canonicalList.get(canonicalList.size() - 1));
        Assert.assertEquals(canonicalSet.first(), testAsSet.first());
        Assert.assertEquals(canonicalSet.last(), testAsSet.last());
        Assert.assertEquals(canonicalSet.first(), testAsList.get(0));
        Assert.assertEquals(canonicalSet.last(), testAsList.get(testAsList.size() - 1));
      }

      return new RandomSelection(
          keys, canonicalSet, testAsSet, canonicalList, testAsList, comparator);
    }
Exemplo n.º 15
0
  public static serverObjects respond(
      final RequestHeader header, final serverObjects post, final serverSwitch env) {
    // return variable that accumulates replacements
    final serverObjects prop = new serverObjects();
    final Switchboard sb = (Switchboard) env;

    // set if this should be visible
    if (yacyBuildProperties.isPkgManager()) {
      prop.put("candeploy", "2");
      return prop;
    } else if (OS.canExecUnix || OS.isWindows) {
      // we can deploy a new system with (i.e.)
      // cd DATA/RELEASE;tar xfz $1;cp -Rf yacy/* ../../;rm -Rf yacy
      prop.put("candeploy", "1");
    } else {
      prop.put("candeploy", "0");
    }

    prop.put("candeploy_configCommit", "0");
    prop.put("candeploy_autoUpdate", "0");
    prop.put("candeploy_downloadsAvailable", "0");

    if (post != null) {
      // check if update is supposed to be installed and a release is defined
      if (post.containsKey("update") && !post.get("releaseinstall", "").isEmpty()) {
        prop.put("forwardToSteering", "1");
        prop.putHTML("forwardToSteering_release", post.get("releaseinstall", ""));
        prop.put("deploys", "1");
        prop.put("candeploy", "2"); // display nothing else
        return prop;
      }

      if (post.containsKey("downloadRelease")) {
        // download a release
        final String release = post.get("releasedownload", "");
        if (!release.isEmpty()) {
          try {
            yacyRelease versionToDownload = new yacyRelease(new DigestURI(release));

            // replace this version with version which contains public key
            final yacyRelease.DevAndMainVersions allReleases =
                yacyRelease.allReleases(false, false);
            final Set<yacyRelease> mostReleases =
                versionToDownload.isMainRelease() ? allReleases.main : allReleases.dev;
            for (final yacyRelease rel : mostReleases) {
              if (rel.equals(versionToDownload)) {
                versionToDownload = rel;
                break;
              }
            }
            versionToDownload.downloadRelease();
          } catch (final IOException e) {
            // TODO Auto-generated catch block
            Log.logException(e);
          }
        }
      }

      if (post.containsKey("checkRelease")) {
        yacyRelease.allReleases(true, false);
      }

      if (post.containsKey("deleteRelease")) {
        final String release = post.get("releaseinstall", "");
        if (!release.isEmpty()) {
          try {
            FileUtils.deletedelete(new File(sb.releasePath, release));
            FileUtils.deletedelete(new File(sb.releasePath, release + ".sig"));
          } catch (final NullPointerException e) {
            sb.getLog()
                .logSevere(
                    "AUTO-UPDATE: could not delete release " + release + ": " + e.getMessage());
          }
        }
      }

      if (post.containsKey("autoUpdate")) {
        final yacyRelease updateVersion = yacyRelease.rulebasedUpdateInfo(true);
        if (updateVersion == null) {
          prop.put("candeploy_autoUpdate", "2"); // no more recent release found
        } else {
          // there is a version that is more recent. Load it and re-start with it
          sb.getLog()
              .logInfo("AUTO-UPDATE: downloading more recent release " + updateVersion.getUrl());
          final File downloaded = updateVersion.downloadRelease();
          prop.putHTML("candeploy_autoUpdate_downloadedRelease", updateVersion.getName());
          final boolean devenvironment = new File(sb.getAppPath(), ".svn").exists();
          if (devenvironment) {
            sb.getLog()
                .logInfo("AUTO-UPDATE: omitting update because this is a development environment");
            prop.put("candeploy_autoUpdate", "3");
          } else if ((downloaded == null) || (!downloaded.exists()) || (downloaded.length() == 0)) {
            sb.getLog()
                .logInfo(
                    "AUTO-UPDATE: omitting update because download failed (file cannot be found, is too small or signature was bad)");
            prop.put("candeploy_autoUpdate", "4");
          } else {
            yacyRelease.deployRelease(downloaded);
            sb.terminate(10, "manual release update to " + downloaded.getName());
            sb.getLog().logInfo("AUTO-UPDATE: deploy and restart initiated");
            prop.put("candeploy_autoUpdate", "1");
          }
        }
      }

      if (post.containsKey("configSubmit")) {
        prop.put("candeploy_configCommit", "1");
        sb.setConfig(
            "update.process",
            ("manual".equals(post.get("updateMode", "manual"))) ? "manual" : "auto");
        sb.setConfig("update.cycle", Math.max(12, post.getLong("cycle", 168)));
        sb.setConfig("update.blacklist", post.get("blacklist", ""));
        sb.setConfig(
            "update.concept", ("any".equals(post.get("releaseType", "any"))) ? "any" : "main");
        sb.setConfig(
            "update.onlySignedFiles", (post.getBoolean("onlySignedFiles", false)) ? "1" : "0");
      }
    }

    // version information
    final String versionstring =
        yacyBuildProperties.getVersion() + "/" + yacyBuildProperties.getSVNRevision();
    prop.putHTML("candeploy_versionpp", versionstring);
    final boolean devenvironment = new File(sb.getAppPath(), ".svn").exists();
    float thisVersion = Float.parseFloat(yacyBuildProperties.getVersion());
    // cut off the SVN Rev in the Version
    try {
      thisVersion = (float) (Math.round(thisVersion * 1000.0) / 1000.0);
    } catch (final NumberFormatException e) {
    }

    // list downloaded releases
    final File[] downloadedFiles = sb.releasePath.listFiles();
    // list can be null if RELEASE directory has been deleted manually
    final int downloadedFilesNum = (downloadedFiles == null) ? 0 : downloadedFiles.length;

    prop.put(
        "candeploy_deployenabled",
        (downloadedFilesNum == 0)
            ? "0"
            : ((devenvironment) ? "1" : "2")); // prevent that a developer-version is over-deployed

    final NavigableSet<yacyRelease> downloadedReleases = new TreeSet<yacyRelease>();
    for (final File downloaded : downloadedFiles) {
      try {
        final yacyRelease release = new yacyRelease(downloaded);
        downloadedReleases.add(release);
      } catch (final RuntimeException e) {
        // not a valid release
        // can be also a restart- or deploy-file
        final File invalid = downloaded;
        if (!(invalid.getName().endsWith(".bat")
            || invalid.getName().endsWith(".sh")
            || invalid
                .getName()
                .endsWith(".sig"))) { // Windows & Linux don't like deleted scripts while execution!
          invalid.deleteOnExit();
        }
      }
    }
    // latest downloaded release
    final yacyVersion dflt = (downloadedReleases.isEmpty()) ? null : downloadedReleases.last();
    // check if there are any downloaded releases and if there are enable the update buttons
    prop.put("candeploy_downloadsAvailable", (downloadedReleases.isEmpty()) ? "0" : "1");
    prop.put(
        "candeploy_deployenabled_buttonsActive",
        (downloadedReleases.isEmpty() || devenvironment) ? "0" : "1");

    int relcount = 0;
    for (final yacyRelease release : downloadedReleases) {
      prop.put(
          "candeploy_downloadedreleases_" + relcount + "_name",
          ((release.isMainRelease()) ? "main" : "dev")
              + " "
              + release.getReleaseNr()
              + "/"
              + release.getSvn());
      prop.put(
          "candeploy_downloadedreleases_" + relcount + "_signature",
          (release.getSignatureFile().exists() ? "1" : "0"));
      prop.putHTML("candeploy_downloadedreleases_" + relcount + "_file", release.getName());
      prop.put(
          "candeploy_downloadedreleases_" + relcount + "_selected", (release == dflt) ? "1" : "0");
      relcount++;
    }
    prop.put("candeploy_downloadedreleases", relcount);

    // list remotely available releases
    final yacyRelease.DevAndMainVersions releasess = yacyRelease.allReleases(false, false);
    relcount = 0;

    final ArrayList<yacyRelease> rlist = new ArrayList<yacyRelease>();
    final Set<yacyRelease> remoteDevReleases = releasess.dev;
    remoteDevReleases.removeAll(downloadedReleases);
    for (final yacyRelease release : remoteDevReleases) {
      rlist.add(release);
    }
    final Set<yacyRelease> remoteMainReleases = releasess.main;
    remoteMainReleases.removeAll(downloadedReleases);
    for (final yacyRelease release : remoteMainReleases) {
      rlist.add(release);
    }
    yacyRelease release;
    for (int i = rlist.size() - 1; i >= 0; i--) {
      release = rlist.get(i);
      prop.put(
          "candeploy_availreleases_" + relcount + "_name",
          ((release.isMainRelease()) ? "main" : "dev")
              + " "
              + release.getReleaseNr()
              + "/"
              + release.getSvn());
      prop.put("candeploy_availreleases_" + relcount + "_url", release.getUrl().toString());
      prop.put(
          "candeploy_availreleases_" + relcount + "_signatures",
          (release.getPublicKey() != null ? "1" : "0"));
      prop.put("candeploy_availreleases_" + relcount + "_selected", (relcount == 0) ? "1" : "0");
      relcount++;
    }

    prop.put("candeploy_availreleases", relcount);

    // properties for automated system update
    prop.put(
        "candeploy_manualUpdateChecked",
        ("manual".equals(sb.getConfig("update.process", "manual"))) ? "1" : "0");
    prop.put(
        "candeploy_autoUpdateChecked",
        ("auto".equals(sb.getConfig("update.process", "manual"))) ? "1" : "0");
    prop.put("candeploy_cycle", sb.getConfigLong("update.cycle", 168));
    prop.putHTML("candeploy_blacklist", sb.getConfig("update.blacklist", ""));
    prop.put(
        "candeploy_releaseTypeMainChecked",
        ("any".equals(sb.getConfig("update.concept", "any"))) ? "0" : "1");
    prop.put(
        "candeploy_releaseTypeAnyChecked",
        ("any".equals(sb.getConfig("update.concept", "any"))) ? "1" : "0");
    prop.put("candeploy_lastlookup", (sb.getConfigLong("update.time.lookup", 0) == 0) ? "0" : "1");
    prop.put(
        "candeploy_lastlookup_time",
        new Date(sb.getConfigLong("update.time.lookup", 0)).toString());
    prop.put(
        "candeploy_lastdownload", (sb.getConfigLong("update.time.download", 0) == 0) ? "0" : "1");
    prop.put(
        "candeploy_lastdownload_time",
        new Date(sb.getConfigLong("update.time.download", 0)).toString());
    prop.put("candeploy_lastdeploy", (sb.getConfigLong("update.time.deploy", 0) == 0) ? "0" : "1");
    prop.put(
        "candeploy_lastdeploy_time",
        new Date(sb.getConfigLong("update.time.deploy", 0)).toString());
    prop.put(
        "candeploy_onlySignedFiles",
        ("1".equals(sb.getConfig("update.onlySignedFiles", "1"))) ? "1" : "0");

    /*
    if ((adminaccess) && (yacyVersion.latestRelease >= (thisVersion+0.01))) { // only new Versions(not new SVN)
        if ((yacyVersion.latestMainRelease != null) ||
            (yacyVersion.latestDevRelease != null)) {
            prop.put("hintVersionDownload", 1);
        } else if ((post != null) && (post.containsKey("aquirerelease"))) {
            yacyVersion.aquireLatestReleaseInfo();
            prop.put("hintVersionDownload", 1);
        } else {
            prop.put("hintVersionAvailable", 1);
        }
    }
    prop.put("hintVersionAvailable", 1); // for testing

    prop.putASIS("hintVersionDownload_versionResMain", (yacyVersion.latestMainRelease == null) ? "-" : yacyVersion.latestMainRelease.toAnchor());
    prop.putASIS("hintVersionDownload_versionResDev", (yacyVersion.latestDevRelease == null) ? "-" : yacyVersion.latestDevRelease.toAnchor());
    prop.put("hintVersionAvailable_latestVersion", Float.toString(yacyVersion.latestRelease));
     */

    return prop;
  }
Exemplo n.º 16
0
  public ExtractorContext extractContent() {

    HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();

    ExtractorContext ctx = new ExtractorContext(page, htmlParseData.getHtml());

    ctx.setMetaTags(MetadataExtractor.INST.extractMetaTags(ctx.getDoc()));

    String language = ctx.getMetaTags().get(MetadataExtractor.CONTENT_LANGUAGE_TAG);

    // skip not english pages
    if ((language != null && !ENGLISH_US.equals(language.toLowerCase()))) {
      LOG.info("Skipping " + ctx.getUrl() + " not ENGLISH");
      return null;
    }

    if (ctx.getUrl().contains(YOUTUBE)) {

      String keyWords = ctx.getMetaTags().get(MetadataExtractor.KEYWORDS_TAG);

      if (keyWords == null) {
        LOG.info("Skipping " + ctx.getUrl() + " no KEYWORDS");
        return null;
      }

      keyWords = keyWords.toLowerCase();

      if (!keyWords.contains(SAP)) {
        LOG.info("Skipping " + ctx.getUrl() + " no SAP in KEYWORDS");
        return null;
      }
    }

    Document clearedHtml = cleanHtml(ctx);

    if (clearedHtml.text().length() < MIN_CONTENT_LENGTH) {
      LOG.info("Skipping " + ctx.getUrl() + " less then " + MIN_CONTENT_LENGTH + " characters");
      return null;
    }

    // skip pages with not English content
    if (!LanguageDetector.INST.isEnglish(clearedHtml.text())) {
      LOG.info("Skipping " + ctx.getUrl() + " not ENGLISH");
      return null;
    }

    NavigableSet<ExtractedImage> images = MostRelevantImageExtractor.INST.extract(ctx);

    String uniqueFileName = CryptographyUtils.sha1(clearedHtml.text());

    PageObject pageObject = new PageObject();
    pageObject.setId(uniqueFileName);
    pageObject.setTitle(ctx.getTitle());
    pageObject.setContent(clearedHtml.html());
    pageObject.setUrl(page.getWebURL().getURL());
    pageObject.setHtmlCleanerName(ctx.getCleaner().getName());

    List<TextBlock> blocks = TextBlocksExtractor.INST.extract(clearedHtml);

    if (!blocks.isEmpty()) {
      pageObject.setMainBlock(blocks.get(0));
      blocks.remove(0);
      pageObject.addBlocks(blocks);
    }

    // set title from First text block if NULL or EMPTY.
    if ((pageObject.getTitle() == null || "".equals(pageObject.getTitle().trim()))
        && !pageObject.getBlocks().isEmpty()) {
      pageObject.setTitle(pageObject.getBlocks().get(0).getTitle());
    }

    pageObject.addAllKeywords(BaseClassifier.INST.classify(ctx));

    if (!images.isEmpty()) {
      pageObject.addImages(images);
    }

    ctx.setPageObject(pageObject);

    return ctx;
  }