/**
 * Verifies equal-count discretization of a single continuous column into three categories.
 *
 * <p>Nine distinct values are written into a one-column data set, variable {@code X} is
 * registered for a three-way equal-count split, and each row of the result is compared with the
 * category index expected for it: the three smallest values are expected in category 0, the
 * middle three in category 1 and the three largest in category 2.
 */
public static void testManualDiscretize() {
  Node x = new ContinuousVariable("X");
  List<Node> nodes = Collections.singletonList(x);

  // Row i of the data set holds values[i]; expectedBins[i] is the category asserted for that row.
  double[] values = {13.0, 1.2, 2.2, 4.5, 12.005, 5.5, 10.1, 7.5, 3.4};
  int[] expectedBins = {2, 0, 0, 1, 2, 1, 2, 1, 0};

  DataSet data = new ColtDataSet(values.length, nodes);
  for (int row = 0; row < values.length; row++) {
    data.setDouble(row, 0, values[row]);
  }
  System.out.println(data);

  Discretizer discretizer = new Discretizer(data);
  discretizer.setVariablesCopied(true);
  discretizer.equalCounts(x, 3);

  DataSet discretized = discretizer.discretize();
  System.out.println(discretized);

  for (int row = 0; row < expectedBins.length; row++) {
    // Expected value goes first so that a failure message reports the two values correctly.
    assertEquals("category of row " + row, expectedBins[row], discretized.getInt(row, 0));
  }
}
// Causes a package cycle. public void testManualDiscretize2() { Graph graph = new Dag(GraphUtils.randomGraph(5, 0, 5, 3, 3, 3, false)); SemPm pm = new SemPm(graph); SemIm im = new SemIm(pm); DataSet data = im.simulateData(100, false); List<Node> nodes = data.getVariables(); Discretizer discretizer = new Discretizer(data); // discretizer.setVariablesCopied(true); discretizer.equalCounts(nodes.get(0), 3); discretizer.equalIntervals(nodes.get(1), 2); discretizer.equalCounts(nodes.get(2), 5); discretizer.equalIntervals(nodes.get(3), 8); discretizer.equalCounts(nodes.get(4), 4); DataSet discretized = discretizer.discretize(); System.out.println(discretized); assertEquals(2, maxInColumn(discretized, 0)); assertEquals(1, maxInColumn(discretized, 1)); assertEquals(4, maxInColumn(discretized, 2)); assertEquals(7, maxInColumn(discretized, 3)); assertEquals(3, maxInColumn(discretized, 4)); }
public DiscretFuzzy(String[][] inputDataArray, LinkedList attributes) { BigDecimal bigdecIntervals[][]; discretizer = new Discretizer( inputDataArray, attributes, InputData.getDiscType(), InputData.getNumIntervals() + 1); bigdecIntervals = getBigdecIntervals(inputDataArray, discretizer.getIntervals(), getVariablesToDiscretize()); fuzzyfier = new Fuzzyfier( inputDataArray, getVariablesToDiscretize(), discretizer.getIntervals(), bigdecIntervals, InputData.getDiscType()); fuzzyfiedData = fuzzyfier.getFuzzyfiedDataToPrint(); }
/**
 * Checks the breakpoints returned by {@code Discretizer.getEqualFrequencyBreakPoints} for a
 * nine-value sample, first for three categories (two breakpoints expected) and then for four
 * categories (three breakpoints expected). The three-category breakpoints are also used to run
 * {@code Discretizer.discretize}, whose result is only printed.
 */
public static void testBreakpointCalculation() {
  double[] data = {13, 1.2, 2.2, 4.5, 12.005, 5.5, 10.1, 7.5, 3.4};

  double[] breakpoints = Discretizer.getEqualFrequencyBreakPoints(data, 3);
  // assertEquals reports the actual length on failure, unlike assertTrue(length == n).
  assertEquals(2, breakpoints.length);
  // Delta 0.0 keeps the comparison exact while using the dedicated double overload.
  assertEquals(4.5, breakpoints[0], 0.0);
  assertEquals(10.1, breakpoints[1], 0.0);

  Discretizer.Discretization dis =
      Discretizer.discretize(data, breakpoints, "after", Arrays.asList("0", "1", "2"));
  System.out.println(dis);

  breakpoints = Discretizer.getEqualFrequencyBreakPoints(data, 4);
  assertEquals(3, breakpoints.length);
  assertEquals(3.4, breakpoints[0], 0.0);
  assertEquals(5.5, breakpoints[1], 0.0);
  assertEquals(10.1, breakpoints[2], 0.0);
}
/**
 * Command-line entry point: parses the parameters, reads the training set, builds cut points with
 * a {@code FayyadDiscretizer} and applies the discretization to the training and test files named
 * in {@code Parameters}.
 *
 * <p>If no argument is supplied, a usage message is printed to standard error and the process
 * exits with status 1. If reading the training set fails, the error is logged and the process
 * also exits with status 1.
 *
 * @param args the command line arguments; {@code args[0]} is handed to
 *     {@code ParserParameters.doParse}
 */
public static void main(String[] args) {
  // Guard the args[0] access below; the log manager is not initialised yet, so use stderr.
  if (args == null || args.length == 0) {
    System.err.println("Usage: expected one argument (passed to ParserParameters.doParse).");
    System.exit(1);
    return;
  }

  ParserParameters.doParse(args[0]);
  LogManager.initLogManager();

  InstanceSet is = new InstanceSet();
  try {
    is.readSet(Parameters.trainInputFile, true);
  } catch (Exception e) {
    LogManager.printErr(e.toString());
    System.exit(1);
  }
  checkDataset();

  Discretizer dis = new FayyadDiscretizer();
  dis.buildCutPoints(is);
  dis.applyDiscretization(Parameters.trainInputFile, Parameters.trainOutputFile);
  dis.applyDiscretization(Parameters.testInputFile, Parameters.testOutputFile);

  LogManager.closeLog();
}
/**
 * Discretizes only the first column of a simulated five-variable data set and checks that the
 * first variable of the result is a {@code DiscreteVariable} while the remaining four are still
 * {@code ContinuousVariable}s.
 */
public void testManualDiscretize3() {
  Graph graph = new Dag(GraphUtils.randomGraph(5, 0, 5, 3, 3, 3, false));
  SemPm pm = new SemPm(graph);
  SemIm im = new SemIm(pm);
  DataSet data = im.simulateData(100, false);
  List<Node> nodes = data.getVariables();

  Discretizer discretizer = new Discretizer(data);
  // Set once; the original repeated this call back to back.
  discretizer.setVariablesCopied(true);
  discretizer.equalCounts(nodes.get(0), 3);

  DataSet discretized = discretizer.discretize();
  System.out.println(discretized);

  assertTrue(
      "column 0 should be discrete", discretized.getVariable(0) instanceof DiscreteVariable);
  // Columns 1..4 were not registered for discretization.
  for (int col = 1; col <= 4; col++) {
    assertTrue(
        "column " + col + " should be continuous",
        discretized.getVariable(col) instanceof ContinuousVariable);
  }
}
/**
 * Discretizes six values with the cutoffs 2.5 and 3.2 into the categories "lo", "med" and "hi"
 * and checks the category name assigned to every value.
 */
public void testContinuous() {
  final double[] data = {1, 2, 2.5, 3, 4, 5};
  final double[] cutoffs = {2.5, 3.2};

  Discretizer.Discretization discretization =
      Discretizer.discretize(data, cutoffs, "after", Arrays.asList("lo", "med", "hi"));
  System.out.println(discretization);

  List<String> labels = discretization.getVariable().getCategories();
  int[] codes = discretization.getData();

  // expectedLabels[i] is the category name asserted for data[i].
  String[] expectedLabels = {"lo", "lo", "med", "med", "hi", "hi"};
  for (int i = 0; i < expectedLabels.length; i++) {
    assertEquals(expectedLabels[i], labels.get(codes[i]));
  }
}
/**
 * Reads the cliques of every ego record, collects clique statistics, conditionally inserts edges
 * between clique members, and writes several histogram and correlation files.
 *
 * <p>For each record with {@code isEgo} set, the cliques returned by {@code sqlData.getCliques}
 * are resolved to vertices through {@code map} and {@code idMap}. For every pair of resolved
 * vertices whose projected counterparts are both not sampled, an edge is attempted with
 * probability 0.62 ({@code Math.random()}), so results differ between runs. Output files are
 * written to hard-coded absolute paths. I/O exceptions are printed and otherwise ignored.
 *
 * @param records vertex records to scan; only those with {@code isEgo} set are processed
 * @param sqlData source of the alter-key mapping and of the cliques per ego record
 */
private void loadSociogramData(Collection<VertexRecord> records, SQLDumpReader sqlData) {
  logger.info("Loading sociogram data...");
  Map<String, VertexRecord> map = sqlData.getFullAlterKeyMappping(records);

  // Snapshot of each projected vertex's neighbour count, taken before any edge is added below.
  TObjectIntHashMap<Vertex> rawDegrees = new TObjectIntHashMap<Vertex>();
  for (Vertex v : proj.getVertices()) {
    rawDegrees.put(v, v.getNeighbours().size());
  }

  // Counters reported in the two log lines at the end of the method.
  int edgecnt = 0;
  int doublecnt = 0;
  int egoEdge = 0;

  Set<Vertex> notOkVertices = new HashSet<Vertex>();
  Set<Vertex> okVertices = new HashSet<Vertex>();
  DescriptiveStatistics notOkStats = new DescriptiveStatistics();
  DescriptiveStatistics okStats = new DescriptiveStatistics();
  // Number of cliques per ego (all egos / only egos with at least one clique) and clique sizes.
  DescriptiveStatistics numDistr = new DescriptiveStatistics();
  DescriptiveStatistics numDistrNoZero = new DescriptiveStatistics();
  DescriptiveStatistics sizeDistr = new DescriptiveStatistics();

  // Parallel value lists used for the correlation files: one entry per clique
  // (sizeValues, kSizeValues, numValues2) or one entry per ego (numValues, kNumValues).
  TDoubleArrayList sizeValues = new TDoubleArrayList();
  TDoubleArrayList kSizeValues = new TDoubleArrayList();
  TDoubleArrayList numValues = new TDoubleArrayList();
  TDoubleArrayList numValues2 = new TDoubleArrayList();
  TDoubleArrayList kNumValues = new TDoubleArrayList();

  for (VertexRecord record : records) {
    if (record.isEgo) {
      List<Set<String>> cliques = sqlData.getCliques(record);
      numDistr.addValue(cliques.size());
      // NOTE(review): v is dereferenced without a null check, unlike the alter lookup below.
      Vertex v = idMap.get(record.id);
      numValues.add(cliques.size());
      kNumValues.add(v.getNeighbours().size());
      if (!cliques.isEmpty()) numDistrNoZero.addValue(cliques.size());

      for (Set<String> clique : cliques) {
        sizeDistr.addValue(clique.size());
        sizeValues.add(clique.size());
        kSizeValues.add(rawDegrees.get(projMap.get(v)));
        numValues2.add(cliques.size());

        // Resolve the clique's alter keys to vertices; unresolved entries are logged and skipped.
        List<SocialSparseVertex> vertices = new ArrayList<SocialSparseVertex>(clique.size());
        for (String alter : clique) {
          // The lookup key is the ego's SQL id concatenated with the alter key.
          VertexRecord r = map.get(record.egoSQLId + alter);
          if (r != null) {
            SocialSparseVertex vertex = idMap.get(r.id);
            if (vertex != null) {
              vertices.add(vertex);
            } else {
              logger.warn("Vertex not found.");
            }
          } else {
            logger.warn("Record not found.");
          }
        }

        // Visit every unordered pair of resolved clique members.
        for (int i = 0; i < vertices.size(); i++) {
          for (int j = i + 1; j < vertices.size(); j++) {
            SampledVertexDecorator<SocialSparseVertex> vProj1 = projMap.get(vertices.get(i));
            SampledVertexDecorator<SocialSparseVertex> vProj2 = projMap.get(vertices.get(j));
            // Only pairs in which neither projected vertex is sampled are considered.
            if (!vProj1.isSampled() && !vProj2.isSampled()) {
              // Attempt the edge with probability 0.62.
              if (Math.random() < 0.62) {
                SocialSparseEdge socialEdge =
                    builder.addEdge(graph, vertices.get(i), vertices.get(j));
                if (socialEdge != null) {
                  projBuilder.addEdge(proj, vProj1, vProj2, socialEdge);
                  edgecnt++;
                  // NOTE(review): this branch sits inside the "neither is sampled" guard, so
                  // unless isSampled() changes between calls it cannot be taken; egoEdge and
                  // notOkVertices would then never be updated. Confirm intent.
                  if (vProj1.isSampled() || vProj2.isSampled()) {
                    egoEdge++;
                    if (vProj1.isSampled()) notOkVertices.add(vProj1);
                    else notOkVertices.add(vProj2);
                  }
                } else {
                  // A null edge is counted as "already present" (see the final log message).
                  doublecnt++;
                  // NOTE(review): same guard applies here, so okVertices presumably stays empty.
                  if (vProj1.isSampled()) okVertices.add(vProj1);
                  else if (vProj2.isSampled()) okVertices.add(vProj2);
                }
              }
            }
          }
        }
      }
    }
  }

  for (Vertex v : okVertices) okStats.addValue(rawDegrees.get(v));
  for (Vertex v : notOkVertices) notOkStats.addValue(rawDegrees.get(v));

  try {
    // Degree histograms of the "ok" and "not ok" vertex sets.
    TDoubleDoubleHashMap hist = Histogram.createHistogram(okStats, new LinearDiscretizer(1), false);
    StatsWriter.writeHistogram(
        hist,
        "k",
        "n",
        "/Users/jillenberger/Work/socialnets/data/ivt2009/11-2011/augmented/k_ok.txt");
    TDoubleDoubleHashMap hist2 =
        Histogram.createHistogram(notOkStats, new LinearDiscretizer(1), false);
    StatsWriter.writeHistogram(
        hist2,
        "k",
        "n",
        "/Users/jillenberger/Work/socialnets/data/ivt2009/11-2011/augmented/k_notok.txt");

    // Per key of hist: share of the "not ok" value in the sum of both histogram values.
    TDoubleDoubleHashMap ratio = new TDoubleDoubleHashMap();
    double[] keys = hist.keys();
    for (double k : keys) {
      double val1 = hist2.get(k);
      double val2 = hist.get(k);
      ratio.put(k, val1 / (val2 + val1));
    }
    StatsWriter.writeHistogram(
        ratio,
        "k",
        "p",
        "/Users/jillenberger/Work/socialnets/data/ivt2009/11-2011/augmented/k_ratio.txt");

    logger.info("Mean num of cliques: " + numDistrNoZero.getMean());
    logger.info("Mean size: " + sizeDistr.getMean());
    logger.info("Median num of cliques: " + StatUtils.percentile(numDistrNoZero.getValues(), 50));
    logger.info("Median size: " + StatUtils.percentile(sizeDistr.getValues(), 50));

    // Normalized histograms of cliques per ego (egos with at least one clique) and clique size.
    TDoubleDoubleHashMap histNum =
        Histogram.createHistogram(
            numDistrNoZero,
            FixedSampleSizeDiscretizer.create(numDistrNoZero.getValues(), 2, 20),
            true);
    Histogram.normalize(histNum);
    StatsWriter.writeHistogram(
        histNum,
        "num",
        "freq",
        "/Users/jillenberger/Work/socialnets/data/ivt2009/11-2011/augmented/numCliques.txt");
    TDoubleDoubleHashMap histSize =
        Histogram.createHistogram(
            sizeDistr, FixedSampleSizeDiscretizer.create(sizeDistr.getValues(), 2, 20), true);
    Histogram.normalize(histSize);
    StatsWriter.writeHistogram(
        histSize,
        "size",
        "freq",
        "/Users/jillenberger/Work/socialnets/data/ivt2009/11-2011/augmented/numPersons.txt");

    // Clique size against the discretized raw degree recorded for each clique.
    Discretizer discretizer =
        FixedSampleSizeDiscretizer.create(kSizeValues.toNativeArray(), 20, 20);
    TDoubleArrayList valuesX = new TDoubleArrayList();
    for (int i = 0; i < kSizeValues.size(); i++) {
      valuesX.add(discretizer.discretize(kSizeValues.get(i)));
    }
    Correlations.writeToFile(
        Correlations.mean(valuesX.toNativeArray(), sizeValues.toNativeArray()),
        "/Users/jillenberger/Work/socialnets/data/ivt2009/11-2011/augmented/size_k.txt",
        "k",
        "size");

    // Number of cliques per ego against the ego's discretized neighbour count.
    discretizer = FixedSampleSizeDiscretizer.create(kNumValues.toNativeArray(), 20, 20);
    valuesX = new TDoubleArrayList();
    for (int i = 0; i < kNumValues.size(); i++) {
      valuesX.add(discretizer.discretize(kNumValues.get(i)));
    }
    Correlations.writeToFile(
        Correlations.mean(valuesX.toNativeArray(), numValues.toNativeArray()),
        "/Users/jillenberger/Work/socialnets/data/ivt2009/11-2011/augmented/num_k.txt",
        "k",
        "n");

    // Clique size against the number of cliques of the owning ego.
    Correlations.writeToFile(
        Correlations.mean(numValues2.toNativeArray(), sizeValues.toNativeArray()),
        "/Users/jillenberger/Work/socialnets/data/ivt2009/11-2011/augmented/size_num.txt",
        "num",
        "size");
  } catch (FileNotFoundException e) {
    // TODO Auto-generated catch block
    // The failure is only printed; execution continues with the summary log lines below.
    e.printStackTrace();
  } catch (IOException e) {
    // TODO Auto-generated catch block
    // The failure is only printed; execution continues with the summary log lines below.
    e.printStackTrace();
  }

  logger.info(
      String.format("Inserted %1$s edges, %2$s edges already present.", edgecnt, doublecnt));
  logger.info(String.format("Inserted %1$s edges between at least one ego.", egoEdge));
}