Ejemplo n.º 1
0
  /**
   * Discretizes a single continuous column of nine values into three equal-count categories and
   * checks the category index assigned to every row.
   */
  public static void testManualDiscretize() {
    Node x = new ContinuousVariable("X");
    List<Node> nodes = Collections.singletonList(x);

    // Raw values of column 0, one per row.
    double[] values = {13.0, 1.2, 2.2, 4.5, 12.005, 5.5, 10.1, 7.5, 3.4};
    // Category index expected for the row at the same position in `values`.
    int[] expectedCategories = {2, 0, 0, 1, 2, 1, 2, 1, 0};

    DataSet data = new ColtDataSet(values.length, nodes);
    for (int row = 0; row < values.length; row++) {
      data.setDouble(row, 0, values[row]);
    }

    System.out.println(data);

    Discretizer discretizer = new Discretizer(data);
    discretizer.setVariablesCopied(true);

    discretizer.equalCounts(x, 3);
    DataSet discretized = discretizer.discretize();

    System.out.println(discretized);

    // JUnit's argument order is (expected, actual); the row label makes a failure attributable.
    for (int row = 0; row < expectedCategories.length; row++) {
      assertEquals("row " + row, expectedCategories[row], discretized.getInt(row, 0));
    }
  }
Ejemplo n.º 2
0
  /**
   * Simulates 100 rows from a SEM over a random five-node DAG, discretizes each of the five
   * columns with a different method and category count, and checks that the largest category
   * index found in each column is the requested count minus one.
   *
   * <p>Original author's note: causes a package cycle.
   */
  public void testManualDiscretize2() {
    Graph dag = new Dag(GraphUtils.randomGraph(5, 0, 5, 3, 3, 3, false));
    SemIm semIm = new SemIm(new SemPm(dag));
    DataSet simulated = semIm.simulateData(100, false);

    List<Node> variables = simulated.getVariables();

    // Note: setVariablesCopied(true) is left disabled in this test.
    Discretizer discretizer = new Discretizer(simulated);

    discretizer.equalCounts(variables.get(0), 3);
    discretizer.equalIntervals(variables.get(1), 2);
    discretizer.equalCounts(variables.get(2), 5);
    discretizer.equalIntervals(variables.get(3), 8);
    discretizer.equalCounts(variables.get(4), 4);

    DataSet result = discretizer.discretize();

    System.out.println(result);

    // Highest category index expected per column, in column order.
    int[] expectedMax = {2, 1, 4, 7, 3};
    for (int column = 0; column < expectedMax.length; column++) {
      assertEquals(expectedMax[column], maxInColumn(result, column));
    }
  }
Ejemplo n.º 3
0
  public DiscretFuzzy(String[][] inputDataArray, LinkedList attributes) {
    BigDecimal bigdecIntervals[][];

    discretizer =
        new Discretizer(
            inputDataArray, attributes, InputData.getDiscType(), InputData.getNumIntervals() + 1);
    bigdecIntervals =
        getBigdecIntervals(inputDataArray, discretizer.getIntervals(), getVariablesToDiscretize());

    fuzzyfier =
        new Fuzzyfier(
            inputDataArray,
            getVariablesToDiscretize(),
            discretizer.getIntervals(),
            bigdecIntervals,
            InputData.getDiscType());
    fuzzyfiedData = fuzzyfier.getFuzzyfiedDataToPrint();
  }
Ejemplo n.º 4
0
  /**
   * Checks the break points returned by {@code Discretizer.getEqualFrequencyBreakPoints} for a
   * nine-value sample split into three and into four categories.
   */
  public static void testBreakpointCalculation() {
    double[] data = {13, 1.2, 2.2, 4.5, 12.005, 5.5, 10.1, 7.5, 3.4};
    double[] breakpoints = Discretizer.getEqualFrequencyBreakPoints(data, 3);

    // assertEquals reports the actual length on failure, unlike assertTrue(length == 2).
    assertEquals(2, breakpoints.length);
    // Doubles are compared through the explicit-delta overload; 0.0 keeps the match exact.
    assertEquals(4.5, breakpoints[0], 0.0);
    assertEquals(10.1, breakpoints[1], 0.0);

    // The discretization itself is only printed, not asserted on.
    Discretizer.Discretization dis =
        Discretizer.discretize(data, breakpoints, "after", Arrays.asList("0", "1", "2"));
    System.out.println(dis);

    breakpoints = Discretizer.getEqualFrequencyBreakPoints(data, 4);
    assertEquals(3, breakpoints.length);

    assertEquals(3.4, breakpoints[0], 0.0);
    assertEquals(5.5, breakpoints[1], 0.0);
    assertEquals(10.1, breakpoints[2], 0.0);
  }
Ejemplo n.º 5
0
  /**
   * Parses the run parameters, reads the training set, builds Fayyad cut points from it and
   * applies the resulting discretization to the training and test files.
   *
   * <p>Exits with status 1 when the first argument is missing or the training set cannot be read.
   *
   * @param args the command line arguments; {@code args[0]} is handed to {@code
   *     ParserParameters.doParse}
   */
  public static void main(String[] args) {
    // Fail with a clear message instead of an ArrayIndexOutOfBoundsException.
    if (args == null || args.length == 0) {
      System.err.println(
          "Missing required first command line argument (the value handed to"
              + " ParserParameters.doParse).");
      System.exit(1);
      return;
    }

    ParserParameters.doParse(args[0]);
    LogManager.initLogManager();

    InstanceSet is = new InstanceSet();
    try {
      is.readSet(Parameters.trainInputFile, true);
    } catch (Exception e) {
      LogManager.printErr(e.toString());
      System.exit(1);
    }
    checkDataset();

    Discretizer dis = new FayyadDiscretizer();
    dis.buildCutPoints(is);
    dis.applyDiscretization(Parameters.trainInputFile, Parameters.trainOutputFile);
    dis.applyDiscretization(Parameters.testInputFile, Parameters.testOutputFile);
    LogManager.closeLog();
  }
Ejemplo n.º 6
0
  /**
   * Simulates 100 rows from a SEM over a random five-node DAG, discretizes only the first
   * variable, and checks that it becomes a {@code DiscreteVariable} while the other four remain
   * {@code ContinuousVariable}s.
   */
  public void testManualDiscretize3() {
    Graph graph = new Dag(GraphUtils.randomGraph(5, 0, 5, 3, 3, 3, false));
    SemPm pm = new SemPm(graph);
    SemIm im = new SemIm(pm);
    DataSet data = im.simulateData(100, false);

    List<Node> nodes = data.getVariables();

    Discretizer discretizer = new Discretizer(data);
    // Called once; the original repeated this identical call twice in a row.
    discretizer.setVariablesCopied(true);

    discretizer.equalCounts(nodes.get(0), 3);

    DataSet discretized = discretizer.discretize();

    System.out.println(discretized);

    assertTrue(
        "variable 0 should be discrete", discretized.getVariable(0) instanceof DiscreteVariable);
    // Variables 1..4 had no discretization requested for them.
    for (int column = 1; column <= 4; column++) {
      assertTrue(
          "variable " + column + " should stay continuous",
          discretized.getVariable(column) instanceof ContinuousVariable);
    }
  }
Ejemplo n.º 7
0
  /**
   * Discretizes six continuous samples with the cutoffs 2.5 and 3.2 into the categories "lo",
   * "med" and "hi", and checks the category name assigned to each sample.
   */
  public void testContinuous() {
    double[] samples = {1, 2, 2.5, 3, 4, 5};
    double[] cutoffs = {2.5, 3.2};
    List<String> categories = Arrays.asList("lo", "med", "hi");

    Discretizer.Discretization result =
        Discretizer.discretize(samples, cutoffs, "after", categories);

    System.out.println(result);

    List<String> labels = result.getVariable().getCategories();
    int[] codes = result.getData();

    // Expected category name for the sample at the same position in `samples`.
    String[] expectedLabels = {"lo", "lo", "med", "med", "hi", "hi"};
    for (int i = 0; i < expectedLabels.length; i++) {
      assertEquals(expectedLabels[i], labels.get(codes[i]));
    }
  }
Ejemplo n.º 8
0
  /**
   * Adds edges between alters that an ego placed in the same clique, and writes several clique
   * statistics to text files.
   *
   * <p>For every ego record the cliques returned by {@code sqlData.getCliques} are resolved to
   * vertices. Each pair of vertices in a clique whose projections are both not sampled is
   * connected with probability 0.62. Afterwards, histograms and correlations of clique count,
   * clique size and neighbour count are written below a hard-coded output directory. I/O failures
   * while writing are logged and do not abort the method.
   *
   * @param records the vertex records to process; only those with {@code isEgo} set are used
   * @param sqlData source of the alter key mapping and of the cliques per record
   */
  private void loadSociogramData(Collection<VertexRecord> records, SQLDumpReader sqlData) {
    logger.info("Loading sociogram data...");
    Map<String, VertexRecord> map = sqlData.getFullAlterKeyMappping(records);

    // Neighbour counts of the projection's vertices, captured before any edge is added here.
    TObjectIntHashMap<Vertex> rawDegrees = new TObjectIntHashMap<Vertex>();
    for (Vertex v : proj.getVertices()) {
      rawDegrees.put(v, v.getNeighbours().size());
    }

    int edgecnt = 0;
    int doublecnt = 0;
    int egoEdge = 0;

    Set<Vertex> notOkVertices = new HashSet<Vertex>();
    Set<Vertex> okVertices = new HashSet<Vertex>();
    DescriptiveStatistics notOkStats = new DescriptiveStatistics();
    DescriptiveStatistics okStats = new DescriptiveStatistics();

    DescriptiveStatistics numDistr = new DescriptiveStatistics();
    DescriptiveStatistics numDistrNoZero = new DescriptiveStatistics();
    DescriptiveStatistics sizeDistr = new DescriptiveStatistics();

    // Parallel value lists: one entry per clique (size*, numValues2) or per ego (numValues, kNum*).
    TDoubleArrayList sizeValues = new TDoubleArrayList();
    TDoubleArrayList kSizeValues = new TDoubleArrayList();
    TDoubleArrayList numValues = new TDoubleArrayList();
    TDoubleArrayList numValues2 = new TDoubleArrayList();
    TDoubleArrayList kNumValues = new TDoubleArrayList();

    for (VertexRecord record : records) {
      if (record.isEgo) {
        List<Set<String>> cliques = sqlData.getCliques(record);
        numDistr.addValue(cliques.size());

        Vertex v = idMap.get(record.id);
        numValues.add(cliques.size());
        kNumValues.add(v.getNeighbours().size());

        if (!cliques.isEmpty()) {
          numDistrNoZero.addValue(cliques.size());
        }

        for (Set<String> clique : cliques) {
          sizeDistr.addValue(clique.size());
          sizeValues.add(clique.size());
          kSizeValues.add(rawDegrees.get(projMap.get(v)));
          numValues2.add(cliques.size());

          // Resolve the clique's alter keys to vertices; unresolved entries are skipped.
          List<SocialSparseVertex> vertices = new ArrayList<SocialSparseVertex>(clique.size());
          for (String alter : clique) {
            VertexRecord r = map.get(record.egoSQLId + alter);
            if (r != null) {
              SocialSparseVertex vertex = idMap.get(r.id);
              if (vertex != null) {
                vertices.add(vertex);
              } else {
                logger.warn("Vertex not found.");
              }
            } else {
              logger.warn("Record not found.");
            }
          }

          // Visit every unordered pair of resolved clique members.
          for (int i = 0; i < vertices.size(); i++) {
            for (int j = i + 1; j < vertices.size(); j++) {
              SampledVertexDecorator<SocialSparseVertex> vProj1 = projMap.get(vertices.get(i));
              SampledVertexDecorator<SocialSparseVertex> vProj2 = projMap.get(vertices.get(j));
              if (!vProj1.isSampled() && !vProj2.isSampled()) {

                // Insert the edge with probability 0.62.
                if (Math.random() < 0.62) {
                  SocialSparseEdge socialEdge =
                      builder.addEdge(graph, vertices.get(i), vertices.get(j));
                  if (socialEdge != null) {
                    projBuilder.addEdge(proj, vProj1, vProj2, socialEdge);
                    edgecnt++;

                    // NOTE(review): the enclosing guard already requires both projections to be
                    // unsampled, so unless isSampled() changes while the edge is added, this
                    // branch and the okVertices branch below never fire - confirm intent.
                    if (vProj1.isSampled() || vProj2.isSampled()) {
                      egoEdge++;
                      if (vProj1.isSampled()) {
                        notOkVertices.add(vProj1);
                      } else {
                        notOkVertices.add(vProj2);
                      }
                    }

                  } else {
                    // A null result from builder.addEdge is counted as "already present".
                    doublecnt++;
                    if (vProj1.isSampled()) {
                      okVertices.add(vProj1);
                    } else if (vProj2.isSampled()) {
                      okVertices.add(vProj2);
                    }
                  }
                }
              }
            }
          }
        }
      }
    }

    for (Vertex v : okVertices) {
      okStats.addValue(rawDegrees.get(v));
    }

    for (Vertex v : notOkVertices) {
      notOkStats.addValue(rawDegrees.get(v));
    }

    // Directory that receives every statistics file written below.
    final String outputDir = "/Users/jillenberger/Work/socialnets/data/ivt2009/11-2011/augmented/";
    try {

      TDoubleDoubleHashMap hist =
          Histogram.createHistogram(okStats, new LinearDiscretizer(1), false);
      StatsWriter.writeHistogram(hist, "k", "n", outputDir + "k_ok.txt");

      TDoubleDoubleHashMap hist2 =
          Histogram.createHistogram(notOkStats, new LinearDiscretizer(1), false);
      StatsWriter.writeHistogram(hist2, "k", "n", outputDir + "k_notok.txt");

      // Share of "not ok" counts per key, over the keys of the "ok" histogram.
      TDoubleDoubleHashMap ratio = new TDoubleDoubleHashMap();
      double[] keys = hist.keys();
      for (double k : keys) {
        double val1 = hist2.get(k);
        double val2 = hist.get(k);

        ratio.put(k, val1 / (val2 + val1));
      }
      StatsWriter.writeHistogram(ratio, "k", "p", outputDir + "k_ratio.txt");

      logger.info("Mean num of cliques: " + numDistrNoZero.getMean());
      logger.info("Mean size: " + sizeDistr.getMean());
      logger.info("Median num of cliques: " + StatUtils.percentile(numDistrNoZero.getValues(), 50));
      logger.info("Median size: " + StatUtils.percentile(sizeDistr.getValues(), 50));

      TDoubleDoubleHashMap histNum =
          Histogram.createHistogram(
              numDistrNoZero,
              FixedSampleSizeDiscretizer.create(numDistrNoZero.getValues(), 2, 20),
              true);
      Histogram.normalize(histNum);
      StatsWriter.writeHistogram(histNum, "num", "freq", outputDir + "numCliques.txt");

      TDoubleDoubleHashMap histSize =
          Histogram.createHistogram(
              sizeDistr, FixedSampleSizeDiscretizer.create(sizeDistr.getValues(), 2, 20), true);
      Histogram.normalize(histSize);
      StatsWriter.writeHistogram(histSize, "size", "freq", outputDir + "numPersons.txt");

      // Clique size against the discretized neighbour count of the ego.
      Discretizer discretizer =
          FixedSampleSizeDiscretizer.create(kSizeValues.toNativeArray(), 20, 20);
      TDoubleArrayList valuesX = new TDoubleArrayList();
      for (int i = 0; i < kSizeValues.size(); i++) {
        valuesX.add(discretizer.discretize(kSizeValues.get(i)));
      }

      Correlations.writeToFile(
          Correlations.mean(valuesX.toNativeArray(), sizeValues.toNativeArray()),
          outputDir + "size_k.txt",
          "k",
          "size");

      // Number of cliques against the discretized neighbour count of the ego.
      discretizer = FixedSampleSizeDiscretizer.create(kNumValues.toNativeArray(), 20, 20);
      valuesX = new TDoubleArrayList();
      for (int i = 0; i < kNumValues.size(); i++) {
        valuesX.add(discretizer.discretize(kNumValues.get(i)));
      }

      Correlations.writeToFile(
          Correlations.mean(valuesX.toNativeArray(), numValues.toNativeArray()),
          outputDir + "num_k.txt",
          "k",
          "n");

      Correlations.writeToFile(
          Correlations.mean(numValues2.toNativeArray(), sizeValues.toNativeArray()),
          outputDir + "size_num.txt",
          "num",
          "size");
    } catch (IOException e) {
      // FileNotFoundException is an IOException, so this single handler covers both cases.
      logger.error("Failed to write sociogram statistics to " + outputDir, e);
    }
    logger.info(
        String.format("Inserted %1$s edges, %2$s edges already present.", edgecnt, doublecnt));
    logger.info(String.format("Inserted %1$s edges between at least one ego.", egoEdge));
  }