/**
 * After splitting on an attribute, remove the rows that do not match the chosen value.
 *
 * @param srcData source data
 * @param attrName name of the attribute used for the split
 * @param valueType attribute value to keep
 */
private String[][] removeData(String[][] srcData, String attrName, String valueType) {
  String[][] desDataArray;
  // all rows, copied into a list so they are easier to work with
  ArrayList<String[]> desData = Lists.newArrayList();
  // rows selected to keep after the split
  ArrayList<String[]> selectData = Lists.newArrayList();
  selectData.add(attrNames);

  // convert the array data into a list for easier handling
  for (int i = 0; i < srcData.length; i++) {
    desData.add(srcData[i]);
  }

  // again scan the columns from left to right to find the attribute
  for (int j = 1; j < attrNames.length; j++) {
    if (attrNames[j].equals(attrName)) {
      for (int i = 1; i < desData.size(); i++) {
        if (desData.get(i)[j].equals(valueType)) {
          // keep the matching row; non-matching rows are dropped
          selectData.add(desData.get(i));
        }
      }
    }
  }

  desDataArray = new String[selectData.size()][];
  selectData.toArray(desDataArray);
  return desDataArray;
}
/**
 * Get a string representation of what is in the toIndex queue.
 *
 * @return a list of tasks in the index queue.
 */
public Collection<String> getToIndexQueue() {
  ArrayList<String> newArrayList = Lists.newArrayList();
  for (Future<String> f : this.indexQueue) {
    newArrayList.add(f.toString());
  }
  return newArrayList;
}
/**
 * This method should use the configuration settings to maintain the cache of the content manager
 * object.
 *
 * @param versionJustIndexed - the version we just indexed.
 */
public synchronized void cleanupCache(final String versionJustIndexed) {
  int maxCacheSize = Integer.parseInt(properties.getProperty(Constants.MAX_VERSIONS_TO_CACHE));

  // clean up task queue
  for (Future<?> future : this.indexQueue) {
    if (future.isDone() || future.isCancelled()) {
      this.indexQueue.remove(future);
    }
  }
  log.info("Index job queue currently of size (" + this.indexQueue.size() + ")");

  // first check if our cache is bigger than we want it to be
  if (contentManager.getCachedVersionList().size() > maxCacheSize) {
    log.info(
        "Cache is too full ("
            + contentManager.getCachedVersionList().size()
            + ") finding and deleting old versions");

    // Now we want to decide which versions we can safely get rid of.
    List<String> allCachedVersions = Lists.newArrayList(contentManager.getCachedVersionList());

    // sort them so they are in ascending order with the oldest version first.
    Collections.sort(
        allCachedVersions,
        new Comparator<String>() {
          @Override
          public int compare(final String arg0, final String arg1) {
            return contentManager.compareTo(arg0, arg1);
          }
        });

    for (String version : allCachedVersions) {
      // we want to stop when we have deleted enough.
      if (contentManager.getCachedVersionList().size() <= maxCacheSize) {
        log.info("Cache clear complete");
        break;
      }

      // check we are not deleting the version that is currently
      // in use before we delete it.
      if (!isVersionInUse(version) && !versionJustIndexed.equals(version)) {
        log.info("Requesting to delete the content at version " + version + " from the cache.");
        contentManager.clearCache(version);
      }
    }

    // we couldn't free up enough space
    if (contentManager.getCachedVersionList().size() > maxCacheSize) {
      log.warn(
          "Warning unable to reduce cache to target size: current cache size is "
              + contentManager.getCachedVersionList().size());
    }
  } else {
    log.info(
        "Not evicting cache as we have enough space: current cache size is "
            + contentManager.getCachedVersionList().size()
            + ".");
  }
}
/**
 * Start building the decision tree.
 *
 * @param isID3 whether to build the tree with the ID3 algorithm (otherwise C4.5)
 */
public void startBuildingTree(boolean isID3) {
  readDataFile();
  initAttrValue();

  ArrayList<String> remainAttr = Lists.newArrayList();
  // add the attributes, skipping the last column, which holds the class label
  for (int i = 1; i < attrNames.length - 1; i++) {
    remainAttr.add(attrNames[i]);
  }

  AttrNode rootNode = new AttrNode();
  buildDecisionTree(rootNode, "", data, remainAttr, isID3);
  showDecisionTree(rootNode, 1);
}
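/*
 * A minimal usage sketch for the entry point above. The class name DecisionTreeTool, its
 * constructor taking the path of the training data file, and the file name "input.txt" are
 * assumptions made for illustration only; they do not appear in the source.
 */
public class DecisionTreeDemo {
  public static void main(String[] args) {
    // hypothetical constructor taking the path of the training data file
    DecisionTreeTool tool = new DecisionTreeTool("input.txt");
    // true -> build with ID3 (information gain); false -> build with C4.5 (gain ratio)
    tool.startBuildingTree(true);
  }
}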
@Override
public void importEntities(List<Entity> entities) {
  String defaultNamespace = NamespaceManager.get();
  List<Entity> copies = Lists.newArrayList();
  for (Entity entity : entities) {
    // Imported entities might not have the correct appId, so we make a copy of each one
    copies.add(createEntityCopy(entity));
  }
  entitiesService.putAsync(copies);
  NamespaceManager.set(defaultNamespace);
}
private boolean addEntry(
    final Feed feed, final FeedEntry entry, final List<FeedSubscription> subscriptions) {
  boolean success = false;

  // lock on feed, make sure we are not updating the same feed twice at the same time
  String key1 = StringUtils.trimToEmpty("" + feed.getId());

  // lock on content, make sure we are not updating the same entry twice at the same time
  FeedEntryContent content = entry.getContent();
  String key2 =
      DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getContent() + content.getTitle()));

  Iterator<Lock> iterator = locks.bulkGet(Arrays.asList(key1, key2)).iterator();
  Lock lock1 = iterator.next();
  Lock lock2 = iterator.next();
  boolean locked1 = false;
  boolean locked2 = false;
  try {
    locked1 = lock1.tryLock(1, TimeUnit.MINUTES);
    locked2 = lock2.tryLock(1, TimeUnit.MINUTES);
    if (locked1 && locked2) {
      feedUpdateService.updateEntry(feed, entry);

      List<User> users = Lists.newArrayList();
      for (FeedSubscription sub : subscriptions) {
        users.add(sub.getUser());
      }
      cache.invalidateUnreadCount(subscriptions.toArray(new FeedSubscription[0]));
      cache.invalidateUserRootCategory(users.toArray(new User[0]));

      metricsBean.entryInserted();
      success = true;
    } else {
      log.error("lock timeout for " + feed.getUrl() + " - " + key1);
    }
  } catch (InterruptedException e) {
    log.error(
        "interrupted while waiting for lock for " + feed.getUrl() + " : " + e.getMessage(), e);
  } finally {
    if (locked1) {
      lock1.unlock();
    }
    if (locked2) {
      lock2.unlock();
    }
  }
  return success;
}
private void testBase(String[] ARGS) throws IOException, GeneralSecurityException {
  // Run the pipeline.
  VariantSimilarity.main(ARGS);

  // Download the pipeline results.
  List<GraphResult> results = Lists.newArrayList();
  for (GcsPath path : helper.gcsUtil.expand(GcsPath.fromUri(outputPrefix + "*"))) {
    BufferedReader reader = helper.openOutput(path.toString());
    for (String line = reader.readLine(); line != null; line = reader.readLine()) {
      results.add(GraphResult.fromString(line));
    }
  }

  // Check the pipeline results.
  assertEquals(helper.PLATINUM_GENOMES_NUMBER_OF_SAMPLES, results.size());
  assertThat(results, CoreMatchers.allOf(CoreMatchers.hasItems(EXPECTED_RESULT)));
}
public Feed subscribe(User user, String url, String title, FeedCategory category) {
  final String pubUrl = applicationSettingsService.get().getPublicUrl();
  if (StringUtils.isBlank(pubUrl)) {
    throw new FeedSubscriptionException("Public URL of this CommaFeed instance is not set");
  }
  if (url.startsWith(pubUrl)) {
    throw new FeedSubscriptionException(
        "Could not subscribe to a feed from this CommaFeed instance");
  }

  Feed feed = feedService.findOrCreate(url);

  FeedSubscription sub = feedSubscriptionDAO.findByFeed(user, feed);
  boolean newSubscription = false;
  if (sub == null) {
    sub = new FeedSubscription();
    sub.setFeed(feed);
    sub.setUser(user);
    newSubscription = true;
  }
  sub.setCategory(category);
  sub.setPosition(0);
  sub.setTitle(FeedUtils.truncate(title, 128));
  feedSubscriptionDAO.saveOrUpdate(sub);

  if (newSubscription) {
    List<FeedEntryStatus> statuses = Lists.newArrayList();
    List<FeedEntry> allEntries = feedEntryDAO.findByFeed(feed, 0, 10);
    for (FeedEntry entry : allEntries) {
      FeedEntryStatus status = new FeedEntryStatus();
      status.setEntry(entry);
      status.setRead(false);
      status.setSubscription(sub);
      statuses.add(status);
    }
    feedEntryStatusDAO.saveOrUpdate(statuses);
  }

  taskGiver.add(feed);
  return feed;
}
@Override
public void run() {
  boolean ok = true;

  Feed feed = context.getFeed();
  List<FeedEntry> entries = context.getEntries();

  if (!entries.isEmpty()) {
    List<String> lastEntries = cache.getLastEntries(feed);
    List<String> currentEntries = Lists.newArrayList();

    List<FeedSubscription> subscriptions = null;
    for (FeedEntry entry : entries) {
      String cacheKey = cache.buildUniqueEntryKey(feed, entry);
      if (!lastEntries.contains(cacheKey)) {
        log.debug("cache miss for {}", entry.getUrl());
        if (subscriptions == null) {
          subscriptions = feedSubscriptionDAO.findByFeed(feed);
        }
        ok &= addEntry(feed, entry, subscriptions);
        metricsBean.entryCacheMiss();
      } else {
        log.debug("cache hit for {}", entry.getUrl());
        metricsBean.entryCacheHit();
      }
      currentEntries.add(cacheKey);
    }
    cache.setLastEntries(feed, currentEntries);
  }

  if (applicationSettingsService.get().isPubsubhubbub()) {
    handlePubSub(feed);
  }

  if (!ok) {
    // requeue asap
    feed.setDisabledUntil(new Date(0));
  }

  metricsBean.feedUpdated();
  taskGiver.giveBack(feed);
}
@Override
public void onActivityCreated(Bundle savedInstanceState) {
  super.onActivityCreated(savedInstanceState);

  // setup credential store
  SharedPreferencesCredentialStore credentialStore =
      new SharedPreferencesCredentialStore(
          getActivity(), SamplesConstants.CREDENTIALS_STORE_PREF_FILE, OAuth.JSON_FACTORY);

  // setup authorization flow
  AuthorizationFlow flow =
      new AuthorizationFlow.Builder(
              BearerToken.authorizationHeaderAccessMethod(),
              OAuth.HTTP_TRANSPORT,
              OAuth.JSON_FACTORY,
              new GenericUrl(FoursquareConstants.TOKEN_SERVER_URL),
              new ClientParametersAuthentication(
                  FoursquareConstants.CLIENT_ID, FoursquareConstants.CLIENT_SECRET),
              FoursquareConstants.CLIENT_ID,
              FoursquareConstants.AUTHORIZATION_CODE_SERVER_URL)
          .setScopes(Lists.<String>newArrayList())
          .setCredentialStore(credentialStore)
          .build();

  // setup UI controller
  AuthorizationDialogController controller =
      new DialogFragmentController(getFragmentManager()) {
        @Override
        public String getRedirectUri() throws IOException {
          return FoursquareConstants.REDIRECT_URL;
        }

        @Override
        public boolean isJavascriptEnabledForWebView() {
          return true;
        }
      };

  // instantiate an OAuthManager instance
  oauth = new OAuthManager(flow, controller);
}
/**
 * First collect all distinct values of every attribute; these value sets are used later when
 * computing the entropy of the sub-partitions.
 */
private void initAttrValue() {
  ArrayList<String> tempValues;

  // scan the columns from left to right
  for (int j = 1; j < attrNum; j++) {
    // walk down one column, collecting its values from top to bottom
    tempValues = Lists.newArrayList();
    for (int i = 1; i < data.length; i++) {
      if (!tempValues.contains(data[i][j])) {
        // add the value only if it has not been added before
        tempValues.add(data[i][j]);
      }
    }
    // the whole column has been traversed, copy its values into the attribute map
    attrValue.put(data[0][j], tempValues);
  }

  /*
   * for (Map.Entry entry : attrValue.entrySet()) {
   *   System.out.println("key:value " + entry.getKey() + ":" + entry.getValue());
   * }
   */
}
/** Build the decision tree from the source data. */
private void buildDecisionTree(
    AttrNode node,
    String parentAttrValue,
    String[][] remainData,
    ArrayList<String> remainAttr,
    boolean isID3) {
  node.setParentAttrValue(parentAttrValue);

  String attrName = "";
  double gainValue = 0;
  double tempValue = 0;

  // if only one attribute is left, return directly
  if (remainAttr.size() == 1) {
    System.out.println("attr null");
    return;
  }

  // among the remaining attributes, pick the one with the largest information gain as the next
  // splitting attribute
  for (int i = 0; i < remainAttr.size(); i++) {
    // decide whether to use the ID3 or the C4.5 criterion
    if (isID3) {
      // ID3 compares attributes by their information gain
      tempValue = computeGain(remainData, remainAttr.get(i));
    } else {
      // C4.5 improves on this by comparing the gain ratio, which avoids ID3's bias towards
      // attributes with many distinct values
      tempValue = computeGainRatio(remainData, remainAttr.get(i));
    }

    if (tempValue > gainValue) {
      gainValue = tempValue;
      attrName = remainAttr.get(i);
    }
  }

  node.setAttrName(attrName);
  ArrayList<String> valueTypes = attrValue.get(attrName);
  remainAttr.remove(attrName);

  AttrNode[] childNode = new AttrNode[valueTypes.size()];
  String[][] rData;
  for (int i = 0; i < valueTypes.size(); i++) {
    // keep only the rows that carry this attribute value
    rData = removeData(remainData, attrName, valueTypes.get(i));

    childNode[i] = new AttrNode();
    boolean sameClass = true;
    ArrayList<String> indexArray = Lists.newArrayList();
    for (int k = 1; k < rData.length; k++) {
      indexArray.add(rData[k][0]);
      // check whether all rows belong to the same class
      if (!rData[k][attrNames.length - 1].equals(rData[1][attrNames.length - 1])) {
        // a single mismatch means the rows are not all of the same class
        sameClass = false;
        break;
      }
    }

    if (!sameClass) {
      // create a fresh attribute list for the recursive call; sharing the same reference
      // would corrupt the sibling branches
      ArrayList<String> rAttr = Lists.newArrayList();
      for (String str : remainAttr) {
        rAttr.add(str);
      }
      buildDecisionTree(childNode[i], valueTypes.get(i), rData, rAttr, isID3);
    } else {
      // all rows share the same class, so this child becomes a leaf (data) node
      childNode[i].setParentAttrValue(valueTypes.get(i));
      childNode[i].setChildDataIndex(indexArray);
    }
  }
  node.setChildAttrNode(childNode);
}
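/*
 * The bodies of computeGain and computeGainRatio are not shown above. The sketch below is one
 * possible implementation of the standard ID3 / C4.5 formulas, assuming the same data layout as
 * the rest of the code (row 0 holds the attribute names, column 0 a record id, the last column
 * the class label). The class and method names here are illustrative, not the author's.
 */
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class GainSketch {

  /** Shannon entropy of the class label over the given rows (excluding the header row). */
  private static double entropy(String[][] data) {
    int classCol = data[0].length - 1;
    Map<String, Integer> counts = new HashMap<>();
    for (int i = 1; i < data.length; i++) {
      counts.merge(data[i][classCol], 1, Integer::sum);
    }
    double total = data.length - 1;
    double h = 0;
    for (int count : counts.values()) {
      double p = count / total;
      h -= p * (Math.log(p) / Math.log(2));
    }
    return h;
  }

  /** Information gain of splitting on the attribute in the given column (ID3 criterion). */
  static double informationGain(String[][] data, int attrCol) {
    // group row indices by attribute value
    Map<String, List<Integer>> partitions = new HashMap<>();
    for (int i = 1; i < data.length; i++) {
      partitions.computeIfAbsent(data[i][attrCol], k -> new ArrayList<>()).add(i);
    }
    double total = data.length - 1;
    double remainder = 0;
    for (List<Integer> rows : partitions.values()) {
      // build the sub-table (header + the rows of this partition) and weight its entropy
      String[][] subset = new String[rows.size() + 1][];
      subset[0] = data[0];
      for (int k = 0; k < rows.size(); k++) {
        subset[k + 1] = data[rows.get(k)];
      }
      remainder += (rows.size() / total) * entropy(subset);
    }
    return entropy(data) - remainder;
  }

  /** Gain ratio: information gain divided by the split information (C4.5 criterion). */
  static double gainRatio(String[][] data, int attrCol) {
    Map<String, Integer> counts = new HashMap<>();
    for (int i = 1; i < data.length; i++) {
      counts.merge(data[i][attrCol], 1, Integer::sum);
    }
    double total = data.length - 1;
    double splitInfo = 0;
    for (int count : counts.values()) {
      double p = count / total;
      splitInfo -= p * (Math.log(p) / Math.log(2));
    }
    return splitInfo == 0 ? 0 : informationGain(data, attrCol) / splitInfo;
  }
}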
@SuppressWarnings("unchecked") @Override public TaskStatus run(TaskToolbox toolbox) throws Exception { final List<String> finalHadoopDependencyCoordinates = hadoopDependencyCoordinates != null ? hadoopDependencyCoordinates : toolbox.getConfig().getDefaultHadoopCoordinates(); final DefaultTeslaAether aetherClient = Initialization.getAetherClient(extensionsConfig); final List<URL> extensionURLs = Lists.newArrayList(); for (String coordinate : extensionsConfig.getCoordinates()) { final ClassLoader coordinateLoader = Initialization.getClassLoaderForCoordinates(aetherClient, coordinate); extensionURLs.addAll(Arrays.asList(((URLClassLoader) coordinateLoader).getURLs())); } final List<URL> nonHadoopURLs = Lists.newArrayList(); nonHadoopURLs.addAll( Arrays.asList(((URLClassLoader) HadoopIndexTask.class.getClassLoader()).getURLs())); final List<URL> driverURLs = Lists.newArrayList(); driverURLs.addAll(nonHadoopURLs); // put hadoop dependencies last to avoid jets3t & apache.httpcore version conflicts for (String hadoopDependencyCoordinate : finalHadoopDependencyCoordinates) { final ClassLoader hadoopLoader = Initialization.getClassLoaderForCoordinates(aetherClient, hadoopDependencyCoordinate); driverURLs.addAll(Arrays.asList(((URLClassLoader) hadoopLoader).getURLs())); } final URLClassLoader loader = new URLClassLoader(driverURLs.toArray(new URL[driverURLs.size()]), null); Thread.currentThread().setContextClassLoader(loader); final List<URL> jobUrls = Lists.newArrayList(); jobUrls.addAll(nonHadoopURLs); jobUrls.addAll(extensionURLs); System.setProperty( "druid.hadoop.internal.classpath", Joiner.on(File.pathSeparator).join(jobUrls)); boolean determineIntervals = !spec.getDataSchema().getGranularitySpec().bucketIntervals().isPresent(); final Class<?> determineConfigurationMainClass = loader.loadClass(HadoopDetermineConfigInnerProcessing.class.getName()); final Method determineConfigurationMainMethod = determineConfigurationMainClass.getMethod("runTask", String[].class); String[] determineConfigArgs = new String[] { toolbox.getObjectMapper().writeValueAsString(spec), toolbox.getConfig().getHadoopWorkingPath(), toolbox.getSegmentPusher().getPathForHadoop(getDataSource()) }; String config = (String) determineConfigurationMainMethod.invoke(null, new Object[] {determineConfigArgs}); HadoopIngestionSpec indexerSchema = toolbox.getObjectMapper().readValue(config, HadoopIngestionSpec.class); // We should have a lock from before we started running only if interval was specified final String version; if (determineIntervals) { Interval interval = JodaUtils.umbrellaInterval( JodaUtils.condenseIntervals( indexerSchema.getDataSchema().getGranularitySpec().bucketIntervals().get())); TaskLock lock = toolbox.getTaskActionClient().submit(new LockAcquireAction(interval)); version = lock.getVersion(); } else { Iterable<TaskLock> locks = getTaskLocks(toolbox); final TaskLock myLock = Iterables.getOnlyElement(locks); version = myLock.getVersion(); } log.info("Setting version to: %s", version); final Class<?> indexGeneratorMainClass = loader.loadClass(HadoopIndexGeneratorInnerProcessing.class.getName()); final Method indexGeneratorMainMethod = indexGeneratorMainClass.getMethod("runTask", String[].class); String[] indexGeneratorArgs = new String[] {toolbox.getObjectMapper().writeValueAsString(indexerSchema), version}; String segments = (String) indexGeneratorMainMethod.invoke(null, new Object[] {indexGeneratorArgs}); if (segments != null) { List<DataSegment> publishedSegments = toolbox.getObjectMapper().readValue(segments, 
new TypeReference<List<DataSegment>>() {}); toolbox.pushSegments(publishedSegments); return TaskStatus.success(getId()); } else { return TaskStatus.failure(getId()); } }