/**
 * Command-line entry point: declares the supported options, parses {@code args},
 * copies the parsed values into this object's fields, then calls {@code init()}
 * followed by {@code printClusters(null)}.
 *
 * @param args raw command-line arguments, handed unchanged to {@code parseArguments}
 * @return {@code -1} when {@code parseArguments} yields {@code null}; {@code 0} otherwise
 * @throws Exception propagated from argument handling, {@code init()} or
 *         {@code printClusters}; a non-numeric substring or top-terms value surfaces
 *         as the {@code NumberFormatException} raised by {@code Integer.parseInt}
 */
@Override
public int run(String[] args) throws Exception {
  registerDumperOptions();

  // A null result from parseArguments aborts the run with -1.
  if (parseArguments(args) == null) {
    return -1;
  }

  applyParsedOptions();
  init();
  printClusters(null);
  return 0;
}

/** Declares every command-line option understood by {@code run}, in display order. */
private void registerDumperOptions() {
  addOption(
      SEQ_FILE_DIR_OPTION,
      "s",
      "The directory containing Sequence Files for the Clusters",
      true);
  addOption(
      OUTPUT_OPTION,
      "o",
      "Optional output directory. Default is to output to the console.");
  addOption(
      SUBSTRING_OPTION,
      "b",
      "The number of chars of the asFormatString() to print");
  addOption(
      NUM_WORDS_OPTION,
      "n",
      "The number of top terms to print");
  addOption(
      JSON_OPTION,
      "j",
      "Output the centroid as JSON. Otherwise it substitues in the terms for vector cell entries");
  addOption(
      POINTS_DIR_OPTION,
      "p",
      "The directory containing points sequence files mapping input vectors to their cluster. "
          + "If specified, then the program will output the points associated with a cluster");
  addOption(
      DICTIONARY_OPTION,
      "d",
      "The dictionary file");
  addOption(
      DICTIONARY_TYPE_OPTION,
      "dt",
      "The dictionary file type (text|sequencefile)",
      "text");
}

/** Copies the parsed option values into the corresponding fields of this object. */
private void applyParsedOptions() {
  seqFileDir = new Path(getOption(SEQ_FILE_DIR_OPTION));

  // The points directory is only set when the option was supplied.
  if (hasOption(POINTS_DIR_OPTION)) {
    pointsDir = new Path(getOption(POINTS_DIR_OPTION));
  }

  outputFile = getOption(OUTPUT_OPTION);

  if (hasOption(SUBSTRING_OPTION)) {
    int requestedLength = Integer.parseInt(getOption(SUBSTRING_OPTION));
    // A negative length leaves the current subString value untouched.
    if (requestedLength >= 0) {
      subString = requestedLength;
    }
  }

  // The flag is only ever switched on here, never reset to false.
  if (hasOption(JSON_OPTION)) {
    useJSON = true;
  }

  termDictionary = getOption(DICTIONARY_OPTION);
  dictionaryFormat = getOption(DICTIONARY_TYPE_OPTION);

  if (hasOption(NUM_WORDS_OPTION)) {
    numTopFeatures = Integer.parseInt(getOption(NUM_WORDS_OPTION));
  }
}
/**
 * Builds a dumper for the given directories and immediately calls {@code init()}.
 *
 * @param seqFileDir value stored in the {@code seqFileDir} field
 * @param pointsDir value stored in the {@code pointsDir} field
 * @throws IOException declared by this constructor; presumably raised by
 *         {@code init()} -- confirm against its definition
 */
public ClusterDumper(Path seqFileDir, Path pointsDir) throws IOException {
  // Both fields are assigned before init() is invoked, as init() presumably reads them.
  this.pointsDir = pointsDir;
  this.seqFileDir = seqFileDir;
  init();
}