/**
   * Removes data after an attribute split: keeps only the rows whose value for the split attribute matches the given value.
   *
   * @param srcData source data table (header row first)
   * @param attrName name of the attribute being split on
   * @param valueType attribute value to keep
   */
  private String[][] removeData(String[][] srcData, String attrName, String valueType) {
    String[][] desDataArray;
    ArrayList<String[]> desData = Lists.newArrayList();
    // rows to keep: the header row plus every row matching the split value
    ArrayList<String[]> selectData = Lists.newArrayList();
    selectData.add(attrNames);

    // copy the array rows into a list for easier handling
    for (int i = 0; i < srcData.length; i++) {
      desData.add(srcData[i]);
    }

    // scan the columns from left to right for the split attribute
    for (int j = 1; j < attrNames.length; j++) {
      if (attrNames[j].equals(attrName)) {
        for (int i = 1; i < desData.size(); i++) {
          if (desData.get(i)[j].equals(valueType)) {
            // this row matches the split value, so keep it
            selectData.add(desData.get(i));
          }
        }
      }
    }

    desDataArray = new String[selectData.size()][];
    selectData.toArray(desDataArray);

    return desDataArray;
  }
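To make the selection concrete, a minimal hypothetical run (assuming the attrNames field mirrors the header row of the table):

  String[][] table = {
    {"id", "outlook", "play"}, // header row, mirrored by the attrNames field
    {"1", "sunny", "no"},
    {"2", "rain", "yes"},
    {"3", "sunny", "yes"},
  };
  // Splitting on outlook = "sunny" keeps the header plus rows 1 and 3:
  // {{"id","outlook","play"}, {"1","sunny","no"}, {"3","sunny","yes"}}
  String[][] sunny = removeData(table, "outlook", "sunny");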
  /**
   * Get a string representation of each task in the index queue.
   *
   * @return a list of string representations of the queued index tasks.
   */
  public Collection<String> getToIndexQueue() {
    ArrayList<String> newArrayList = Lists.newArrayList();
    for (Future<String> f : this.indexQueue) {
      newArrayList.add(f.toString());
    }

    return newArrayList;
  }
  /**
   * Uses the configuration settings to keep the content manager's cache of versions within bounds.
   *
   * @param versionJustIndexed - the version we just indexed.
   */
  public synchronized void cleanupCache(final String versionJustIndexed) {
    int maxCacheSize = Integer.parseInt(properties.getProperty(Constants.MAX_VERSIONS_TO_CACHE));

    // clean up task queue
    for (Future<?> future : this.indexQueue) {
      if (future.isDone() || future.isCancelled()) {
        this.indexQueue.remove(future);
      }
    }
    log.info("Index job queue currently of size (" + this.indexQueue.size() + ")");

    // first check if our cache is bigger than we want it to be
    if (contentManager.getCachedVersionList().size() > maxCacheSize) {
      log.info(
          "Cache is too full ("
              + contentManager.getCachedVersionList().size()
              + ") finding and deleting old versions");

      // Now we want to decide which versions we can safely get rid of.
      List<String> allCachedVersions = Lists.newArrayList(contentManager.getCachedVersionList());
      // sort them so they are in ascending order with the oldest version first.
      Collections.sort(
          allCachedVersions,
          new Comparator<String>() {
            @Override
            public int compare(final String arg0, final String arg1) {
              return contentManager.compareTo(arg0, arg1);
            }
          });

      for (String version : allCachedVersions) {
        // we want to stop when we have deleted enough.
        if (contentManager.getCachedVersionList().size() <= maxCacheSize) {
          log.info("Cache clear complete");
          break;
        }

        // check we are not deleting the version that is currently
        // in use before we delete it.
        if (!isVersionInUse(version) && !versionJustIndexed.equals(version)) {
          log.info("Requesting to delete the content at version " + version + " from the cache.");
          contentManager.clearCache(version);
        }
      }

      // we couldn't free up enough space
      if (contentManager.getCachedVersionList().size() > maxCacheSize) {
        log.warn(
            "Unable to reduce cache to target size: current cache size is "
                + contentManager.getCachedVersionList().size());
      }
    } else {
      log.info(
          "Not evicting cache as we have enough space: current cache size is "
              + contentManager.getCachedVersionList().size()
              + ".");
    }
  }
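One note on the queue-cleanup loop at the top of cleanupCache: removing elements inside a for-each is only safe when indexQueue is a concurrent collection (e.g. a ConcurrentLinkedQueue); on a plain ArrayList it would throw ConcurrentModificationException. On any Java 8+ collection, removeIf expresses the same cleanup without the iterate-and-remove hazard; a minimal equivalent, assuming indexQueue is a java.util.Collection:

  // Same cleanup as the loop above, without mutating the queue mid-iteration.
  this.indexQueue.removeIf(future -> future.isDone() || future.isCancelled());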
  /**
   * Starts building the decision tree.
   *
   * @param isID3 whether to build the tree with the ID3 algorithm (C4.5 otherwise)
   */
  public void startBuildingTree(boolean isID3) {
    readDataFile();
    initAttrValue();

    ArrayList<String> remainAttr = Lists.newArrayList();
    // add the attributes, skipping column 0 (the row index) and the last column (the class label)
    for (int i = 1; i < attrNames.length - 1; i++) {
      remainAttr.add(attrNames[i]);
    }

    AttrNode rootNode = new AttrNode();
    buildDecisionTree(rootNode, "", data, remainAttr, isID3);
    showDecisionTree(rootNode, 1);
  }
  @Override
  public void importEntities(List<Entity> entities) {
    String defaultNamespace = NamespaceManager.get();
    List<Entity> copies = Lists.newArrayList();

    for (Entity entity : entities) {
      // Imported entities might not have the correct appId, so we make a copy
      copies.add(createEntityCopy(entity));
    }

    entitiesService.putAsync(copies);

    NamespaceManager.set(defaultNamespace);
  }
  private boolean addEntry(
      final Feed feed, final FeedEntry entry, final List<FeedSubscription> subscriptions) {
    boolean success = false;

    // lock on feed, make sure we are not updating the same feed twice at
    // the same time
    String key1 = StringUtils.trimToEmpty("" + feed.getId());

    // lock on content, make sure we are not updating the same entry
    // twice at the same time
    FeedEntryContent content = entry.getContent();
    String key2 =
        DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getContent() + content.getTitle()));

    Iterator<Lock> iterator = locks.bulkGet(Arrays.asList(key1, key2)).iterator();
    Lock lock1 = iterator.next();
    Lock lock2 = iterator.next();
    boolean locked1 = false;
    boolean locked2 = false;
    try {
      locked1 = lock1.tryLock(1, TimeUnit.MINUTES);
      locked2 = lock2.tryLock(1, TimeUnit.MINUTES);
      if (locked1 && locked2) {
        feedUpdateService.updateEntry(feed, entry);
        List<User> users = Lists.newArrayList();
        for (FeedSubscription sub : subscriptions) {
          users.add(sub.getUser());
        }
        cache.invalidateUnreadCount(subscriptions.toArray(new FeedSubscription[0]));
        cache.invalidateUserRootCategory(users.toArray(new User[0]));
        metricsBean.entryInserted();
        success = true;
      } else {
        log.error("lock timeout for " + feed.getUrl() + " - " + key1);
      }
    } catch (InterruptedException e) {
      log.error(
          "interrupted while waiting for lock for " + feed.getUrl() + " : " + e.getMessage(), e);
    } finally {
      if (locked1) {
        lock1.unlock();
      }
      if (locked2) {
        lock2.unlock();
      }
    }
    return success;
  }
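The two-key locking above matches Guava's Striped locks: bulkGet returns the locks for a set of keys in a globally consistent order, so any two callers that acquire them in iteration order cannot deadlock each other. A minimal standalone sketch of the pattern, with hypothetical names:

  import com.google.common.util.concurrent.Striped;
  import java.util.Arrays;
  import java.util.Iterator;
  import java.util.concurrent.TimeUnit;
  import java.util.concurrent.locks.Lock;

  public class StripedLockSketch {
    // A bounded pool of locks; distinct keys may share a stripe, which only
    // reduces concurrency, never correctness.
    private static final Striped<Lock> locks = Striped.lazyWeakLock(1024);

    public static boolean withBothLocks(String key1, String key2, Runnable action)
        throws InterruptedException {
      // bulkGet hands the locks back in a consistent global order.
      Iterator<Lock> it = locks.bulkGet(Arrays.asList(key1, key2)).iterator();
      Lock lock1 = it.next();
      Lock lock2 = it.next();
      boolean locked1 = false;
      boolean locked2 = false;
      try {
        locked1 = lock1.tryLock(1, TimeUnit.MINUTES);
        locked2 = locked1 && lock2.tryLock(1, TimeUnit.MINUTES);
        if (locked2) {
          action.run();
          return true;
        }
        return false; // lock timeout; the caller decides whether to retry
      } finally {
        if (locked2) lock2.unlock();
        if (locked1) lock1.unlock();
      }
    }
  }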
  private void testBase(String[] args) throws IOException, GeneralSecurityException {
    // Run the pipeline.
    VariantSimilarity.main(args);

    // Download the pipeline results.
    List<GraphResult> results = Lists.newArrayList();
    for (GcsPath path : helper.gcsUtil.expand(GcsPath.fromUri(outputPrefix + "*"))) {
      try (BufferedReader reader = helper.openOutput(path.toString())) {
        for (String line = reader.readLine(); line != null; line = reader.readLine()) {
          results.add(GraphResult.fromString(line));
        }
      }
    }

    // Check the pipeline results.
    assertEquals(helper.PLATINUM_GENOMES_NUMBER_OF_SAMPLES, results.size());

    assertThat(results, CoreMatchers.hasItems(EXPECTED_RESULT));
  }
  public Feed subscribe(User user, String url, String title, FeedCategory category) {

    final String pubUrl = applicationSettingsService.get().getPublicUrl();
    if (StringUtils.isBlank(pubUrl)) {
      throw new FeedSubscriptionException("Public URL of this CommaFeed instance is not set");
    }
    if (url.startsWith(pubUrl)) {
      throw new FeedSubscriptionException(
          "Could not subscribe to a feed from this CommaFeed instance");
    }

    Feed feed = feedService.findOrCreate(url);

    FeedSubscription sub = feedSubscriptionDAO.findByFeed(user, feed);
    boolean newSubscription = false;
    if (sub == null) {
      sub = new FeedSubscription();
      sub.setFeed(feed);
      sub.setUser(user);
      newSubscription = true;
    }
    sub.setCategory(category);
    sub.setPosition(0);
    sub.setTitle(FeedUtils.truncate(title, 128));
    feedSubscriptionDAO.saveOrUpdate(sub);

    if (newSubscription) {
      List<FeedEntryStatus> statuses = Lists.newArrayList();
      List<FeedEntry> allEntries = feedEntryDAO.findByFeed(feed, 0, 10);
      for (FeedEntry entry : allEntries) {
        FeedEntryStatus status = new FeedEntryStatus();
        status.setEntry(entry);
        status.setRead(false);
        status.setSubscription(sub);
        statuses.add(status);
      }
      feedEntryStatusDAO.saveOrUpdate(statuses);
    }
    taskGiver.add(feed);
    return feed;
  }
    @Override
    public void run() {
      boolean ok = true;
      Feed feed = context.getFeed();
      List<FeedEntry> entries = context.getEntries();
      if (!entries.isEmpty()) {

        List<String> lastEntries = cache.getLastEntries(feed);
        List<String> currentEntries = Lists.newArrayList();

        List<FeedSubscription> subscriptions = null;
        for (FeedEntry entry : entries) {
          String cacheKey = cache.buildUniqueEntryKey(feed, entry);
          if (!lastEntries.contains(cacheKey)) {
            log.debug("cache miss for {}", entry.getUrl());
            if (subscriptions == null) {
              subscriptions = feedSubscriptionDAO.findByFeed(feed);
            }
            ok &= addEntry(feed, entry, subscriptions);
            metricsBean.entryCacheMiss();
          } else {
            log.debug("cache hit for {}", entry.getUrl());
            metricsBean.entryCacheHit();
          }
          currentEntries.add(cacheKey);
        }
        cache.setLastEntries(feed, currentEntries);
      }

      if (applicationSettingsService.get().isPubsubhubbub()) {
        handlePubSub(feed);
      }
      if (!ok) {
        // requeue asap
        feed.setDisabledUntil(new Date(0));
      }
      metricsBean.feedUpdated();
      taskGiver.giveBack(feed);
    }
    @Override
    public void onActivityCreated(Bundle savedInstanceState) {
      super.onActivityCreated(savedInstanceState);
      // setup credential store
      SharedPreferencesCredentialStore credentialStore =
          new SharedPreferencesCredentialStore(
              getActivity(), SamplesConstants.CREDENTIALS_STORE_PREF_FILE, OAuth.JSON_FACTORY);
      // setup authorization flow
      AuthorizationFlow flow =
          new AuthorizationFlow.Builder(
                  BearerToken.authorizationHeaderAccessMethod(),
                  OAuth.HTTP_TRANSPORT,
                  OAuth.JSON_FACTORY,
                  new GenericUrl(FoursquareConstants.TOKEN_SERVER_URL),
                  new ClientParametersAuthentication(
                      FoursquareConstants.CLIENT_ID, FoursquareConstants.CLIENT_SECRET),
                  FoursquareConstants.CLIENT_ID,
                  FoursquareConstants.AUTHORIZATION_CODE_SERVER_URL)
              .setScopes(Lists.<String>newArrayList())
              .setCredentialStore(credentialStore)
              .build();
      // setup UI controller
      AuthorizationDialogController controller =
          new DialogFragmentController(getFragmentManager()) {
            @Override
            public String getRedirectUri() throws IOException {
              return FoursquareConstants.REDIRECT_URL;
            }

            @Override
            public boolean isJavascriptEnabledForWebView() {
              return true;
            }
          };
      // instantiate an OAuthManager instance
      oauth = new OAuthManager(flow, controller);
    }
  /** Initializes the full set of values for each attribute, used later when computing the entropy of each subset. */
  private void initAttrValue() {
    ArrayList<String> tempValues;

    // scan column by column, from left to right
    for (int j = 1; j < attrNum; j++) {
      // walk down the column collecting its distinct values
      tempValues = Lists.newArrayList();
      for (int i = 1; i < data.length; i++) {
        if (!tempValues.contains(data[i][j])) {
          // add the value only if it has not been seen before
          tempValues.add(data[i][j]);
        }
      }

      // this column is done; store its value set in the attribute map
      attrValue.put(data[0][j], tempValues);
    }

  }
  /** Builds the decision tree from the source data. */
  private void buildDecisionTree(
      AttrNode node,
      String parentAttrValue,
      String[][] remainData,
      ArrayList<String> remainAttr,
      boolean isID3) {
    node.setParentAttrValue(parentAttrValue);

    String attrName = "";
    double gainValue = 0;
    double tempValue = 0;

    // if only one attribute remains there is nothing left to split on; return directly
    if (remainAttr.size() == 1) {
      System.out.println("attr null");
      return;
    }

    // choose the remaining attribute with the largest information gain as the next split attribute
    for (int i = 0; i < remainAttr.size(); i++) {
      // decide between the ID3 and C4.5 algorithms
      if (isID3) {
        // ID3 compares attributes by raw information gain
        tempValue = computeGain(remainData, remainAttr.get(i));
      } else {
        // C4.5 improves on this by comparing gain ratio instead, which counteracts
        // information gain's bias toward attributes with many distinct values
        tempValue = computeGainRatio(remainData, remainAttr.get(i));
      }

      if (tempValue > gainValue) {
        gainValue = tempValue;
        attrName = remainAttr.get(i);
      }
    }

    node.setAttrName(attrName);
    ArrayList<String> valueTypes = attrValue.get(attrName);
    remainAttr.remove(attrName);

    AttrNode[] childNode = new AttrNode[valueTypes.size()];
    String[][] rData;
    for (int i = 0; i < valueTypes.size(); i++) {
      // drop the rows whose value for this attribute differs from this value type
      rData = removeData(remainData, attrName, valueTypes.get(i));

      childNode[i] = new AttrNode();
      boolean sameClass = true;
      ArrayList<String> indexArray = Lists.newArrayList();
      for (int k = 1; k < rData.length; k++) {
        indexArray.add(rData[k][0]);
        // check whether all rows belong to the same class
        if (!rData[k][attrNames.length - 1].equals(rData[1][attrNames.length - 1])) {
          // a single mismatch means the rows are not all of one class
          sameClass = false;
          break;
        }
      }

      if (!sameClass) {
        // copy the remaining attributes into a new list; sharing one reference across branches would corrupt it
        ArrayList<String> rAttr = Lists.newArrayList();
        for (String str : remainAttr) {
          rAttr.add(str);
        }

        buildDecisionTree(childNode[i], valueTypes.get(i), rData, rAttr, isID3);
      } else {
        // all rows share one class, so this child becomes a leaf data node
        childNode[i].setParentAttrValue(valueTypes.get(i));
        childNode[i].setChildDataIndex(indexArray);
      }
    }
    node.setChildAttrNode(childNode);
  }
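computeGain and computeGainRatio are referenced above but not shown. A minimal self-contained sketch of the ID3-style information gain they presumably implement, for a table whose first row is the header, whose column 0 is a row index, and whose last column is the class label:

  import java.util.HashMap;
  import java.util.Map;

  public class GainSketch {
    // Shannon entropy of the class-label column, skipping the header row.
    static double entropy(String[][] table) {
      Map<String, Integer> counts = new HashMap<>();
      int total = table.length - 1;
      for (int i = 1; i < table.length; i++) {
        counts.merge(table[i][table[i].length - 1], 1, Integer::sum);
      }
      double h = 0;
      for (int c : counts.values()) {
        double p = (double) c / total;
        h -= p * (Math.log(p) / Math.log(2));
      }
      return h;
    }

    // Information gain of splitting on the given attribute column:
    // H(D) minus the size-weighted entropy of each value partition.
    static double gain(String[][] table, int attrCol) {
      Map<String, Integer> valueCounts = new HashMap<>();
      Map<String, Map<String, Integer>> classCounts = new HashMap<>();
      int total = table.length - 1;
      for (int i = 1; i < table.length; i++) {
        String v = table[i][attrCol];
        valueCounts.merge(v, 1, Integer::sum);
        classCounts.computeIfAbsent(v, k -> new HashMap<>())
            .merge(table[i][table[i].length - 1], 1, Integer::sum);
      }
      double conditional = 0;
      for (Map.Entry<String, Integer> e : valueCounts.entrySet()) {
        double weight = (double) e.getValue() / total;
        double h = 0;
        for (int c : classCounts.get(e.getKey()).values()) {
          double p = (double) c / e.getValue();
          h -= p * (Math.log(p) / Math.log(2));
        }
        conditional += weight * h;
      }
      return entropy(table) - conditional;
    }
  }

C4.5's gain ratio would divide this gain by the split information (the entropy of the attribute-value distribution itself), which is what penalizes attributes with many distinct values.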
  @SuppressWarnings("unchecked")
  @Override
  public TaskStatus run(TaskToolbox toolbox) throws Exception {
    final List<String> finalHadoopDependencyCoordinates =
        hadoopDependencyCoordinates != null
            ? hadoopDependencyCoordinates
            : toolbox.getConfig().getDefaultHadoopCoordinates();

    final DefaultTeslaAether aetherClient = Initialization.getAetherClient(extensionsConfig);

    final List<URL> extensionURLs = Lists.newArrayList();
    for (String coordinate : extensionsConfig.getCoordinates()) {
      final ClassLoader coordinateLoader =
          Initialization.getClassLoaderForCoordinates(aetherClient, coordinate);
      extensionURLs.addAll(Arrays.asList(((URLClassLoader) coordinateLoader).getURLs()));
    }

    final List<URL> nonHadoopURLs = Lists.newArrayList();
    nonHadoopURLs.addAll(
        Arrays.asList(((URLClassLoader) HadoopIndexTask.class.getClassLoader()).getURLs()));

    final List<URL> driverURLs = Lists.newArrayList();
    driverURLs.addAll(nonHadoopURLs);
    // put hadoop dependencies last to avoid jets3t & apache.httpcore version conflicts
    for (String hadoopDependencyCoordinate : finalHadoopDependencyCoordinates) {
      final ClassLoader hadoopLoader =
          Initialization.getClassLoaderForCoordinates(aetherClient, hadoopDependencyCoordinate);
      driverURLs.addAll(Arrays.asList(((URLClassLoader) hadoopLoader).getURLs()));
    }

    final URLClassLoader loader =
        new URLClassLoader(driverURLs.toArray(new URL[driverURLs.size()]), null);
    Thread.currentThread().setContextClassLoader(loader);

    final List<URL> jobUrls = Lists.newArrayList();
    jobUrls.addAll(nonHadoopURLs);
    jobUrls.addAll(extensionURLs);

    System.setProperty(
        "druid.hadoop.internal.classpath", Joiner.on(File.pathSeparator).join(jobUrls));
    boolean determineIntervals =
        !spec.getDataSchema().getGranularitySpec().bucketIntervals().isPresent();

    final Class<?> determineConfigurationMainClass =
        loader.loadClass(HadoopDetermineConfigInnerProcessing.class.getName());
    final Method determineConfigurationMainMethod =
        determineConfigurationMainClass.getMethod("runTask", String[].class);

    String[] determineConfigArgs =
        new String[] {
          toolbox.getObjectMapper().writeValueAsString(spec),
          toolbox.getConfig().getHadoopWorkingPath(),
          toolbox.getSegmentPusher().getPathForHadoop(getDataSource())
        };

    String config =
        (String) determineConfigurationMainMethod.invoke(null, new Object[] {determineConfigArgs});
    HadoopIngestionSpec indexerSchema =
        toolbox.getObjectMapper().readValue(config, HadoopIngestionSpec.class);

    // We should have a lock from before we started running only if interval was specified
    final String version;
    if (determineIntervals) {
      Interval interval =
          JodaUtils.umbrellaInterval(
              JodaUtils.condenseIntervals(
                  indexerSchema.getDataSchema().getGranularitySpec().bucketIntervals().get()));
      TaskLock lock = toolbox.getTaskActionClient().submit(new LockAcquireAction(interval));
      version = lock.getVersion();
    } else {
      Iterable<TaskLock> locks = getTaskLocks(toolbox);
      final TaskLock myLock = Iterables.getOnlyElement(locks);
      version = myLock.getVersion();
    }
    log.info("Setting version to: %s", version);

    final Class<?> indexGeneratorMainClass =
        loader.loadClass(HadoopIndexGeneratorInnerProcessing.class.getName());
    final Method indexGeneratorMainMethod =
        indexGeneratorMainClass.getMethod("runTask", String[].class);
    String[] indexGeneratorArgs =
        new String[] {toolbox.getObjectMapper().writeValueAsString(indexerSchema), version};
    String segments =
        (String) indexGeneratorMainMethod.invoke(null, new Object[] {indexGeneratorArgs});

    if (segments != null) {

      List<DataSegment> publishedSegments =
          toolbox.getObjectMapper().readValue(segments, new TypeReference<List<DataSegment>>() {});

      toolbox.pushSegments(publishedSegments);
      return TaskStatus.success(getId());
    } else {
      return TaskStatus.failure(getId());
    }
  }
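The reflective invocation above hinges on one trick: the inner-processing classes are loaded through a URLClassLoader whose parent is null, so the Hadoop dependency versions resolve only from the URLs handed to that loader rather than from the host classpath. A minimal sketch of the isolation pattern, with hypothetical names:

  import java.lang.reflect.Method;
  import java.net.URL;
  import java.net.URLClassLoader;

  public class IsolatedInvokeSketch {
    // Load a class in a classloader with no parent, so only the given jars
    // (plus the JDK's bootstrap classes) can satisfy its dependencies, then
    // invoke a static runTask(String[]) method on it reflectively.
    static Object runIsolated(URL[] jars, String className, String[] args) throws Exception {
      ClassLoader previous = Thread.currentThread().getContextClassLoader();
      URLClassLoader isolated = new URLClassLoader(jars, null);
      try {
        Thread.currentThread().setContextClassLoader(isolated);
        Class<?> clazz = isolated.loadClass(className);
        Method main = clazz.getMethod("runTask", String[].class);
        // The Object[] wrapper passes the whole String[] as the single argument.
        return main.invoke(null, new Object[] {args});
      } finally {
        Thread.currentThread().setContextClassLoader(previous);
      }
    }
  }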