예제 #1
0
  /**
   * Writes the test-set file {@code output + "_test"}.
   *
   * <p>The file consists of the supplied {@code header} followed by a blank line, an
   * {@code @DATA} marker and one row per test vector: every vector in
   * {@code positiveTestExamples} labelled {@code S}, then every vector in
   * {@code negativeTestExamples} labelled {@code NS}. An existing file is overwritten.
   *
   * @param output path prefix of the file to write; {@code "_test"} is appended to it
   * @param bestWordsForm not used by this method; kept so existing callers keep compiling
   * @param header text written verbatim before the data section
   * @throws FileNotFoundException if the target file cannot be opened for writing
   * @throws IOException if writing or closing the file fails
   */
  private void createTestFile(String output, Vector bestWordsForm, StringBuffer header)
      throws FileNotFoundException, IOException {

    OutputStream fout = new FileOutputStream(output + "_test", false);
    OutputStream bout = new BufferedOutputStream(fout);
    // NOTE(review): no charset is given, so the platform default encoding is used.
    OutputStreamWriter outputFile = new OutputStreamWriter(bout);
    try {
      StringBuilder tail = new StringBuilder();
      tail.append("\n");
      tail.append("\n");
      tail.append("@DATA");
      tail.append("\n");
      appendSparseTestRows(tail, positiveTestExamples, "S");
      appendSparseTestRows(tail, negativeTestExamples, "NS");

      outputFile.write(header.toString());
      outputFile.flush();
      outputFile.write(tail.toString());
    } finally {
      // Close on every path so the file handle is released even when building
      // the rows or writing them throws.
      outputFile.close();
    }
  }

  /**
   * Appends one row of the form <code>{index value,...,classIndex label}</code> per vector.
   *
   * <p>Only attributes for which the vector returns a non-null element are listed; the
   * element weight is truncated to an {@code int}. The class column index is
   * {@code attributes.size()}.
   */
  private void appendSparseTestRows(StringBuilder rows, VSMVector[] examples, String label) {
    for (int i = 0; i < examples.length; i++) {
      VSMVector formTemp = examples[i];
      rows.append("{");
      for (int j = 0; j < attributes.size(); j++) {
        VSMElement elemForm = formTemp.getElement(attributes.elementAt(j));
        if (elemForm != null) {
          rows.append(j).append(" ").append((int) elemForm.getWeight()).append(",");
        }
      }
      rows.append(attributes.size()).append(" ").append(label).append("}");
      rows.append("\n");
    }
  }
예제 #2
0
  /**
   * Builds one {@link VSMVector} per selected file.
   *
   * <p>For each position {@code i} below both {@code files.length} and
   * {@code indexes.length}, the file {@code files[indexes[i]]} is turned into a vector.
   * When {@code addToFeatures} is set, every word of that vector has its entry in
   * {@code df} created with weight 1 or replaced by an entry whose weight is one higher.
   *
   * <p>A file that cannot be read is reported on standard error and skipped, so the
   * returned array may be shorter than the number of requested indexes.
   *
   * @param files candidate files
   * @param stoplist stop list handed to the {@link VSMVector} constructor
   * @param indexes positions in {@code files} to load, in order
   * @param addToFeatures whether the words of each loaded vector are counted in {@code df}
   * @return the vectors that were built successfully, in load order
   * @throws SAXException declared for callers; presumably raised by the
   *     {@link VSMVector} constructor (not visible from here)
   */
  protected VSMVector[] createVSM(
      File[] files, StopList stoplist, int[] indexes, boolean addToFeatures) throws SAXException {

    Vector<VSMVector> tempVSM = new Vector<VSMVector>();
    for (int i = 0; i < files.length && i < indexes.length; i++) {
      File source = files[indexes[i]];
      try {
        VSMVector vsm = new VSMVector(source.toString(), isForm, stoplist);
        tempVSM.add(vsm);
        if (addToFeatures) {
          Iterator iterator1 = vsm.getElements();
          while (iterator1.hasNext()) {
            VSMElement elem = (VSMElement) iterator1.next();
            VSMElement value = (VSMElement) df.get(elem.getWord());
            if (value == null) {
              df.put(elem.getWord(), new VSMElement(elem.getWord(), 1));
            } else {
              df.put(elem.getWord(), new VSMElement(elem.getWord(), value.getWeight() + 1));
            }
          }
        }
      } catch (IOException ex) {
        // Stay best-effort (skip this file and continue) but make the failure
        // visible, as the single-file overload does, instead of dropping it.
        System.err.println("createVSM: skipping unreadable file " + source);
        ex.printStackTrace();
      }
    }
    VSMVector[] examples = new VSMVector[tempVSM.size()];
    tempVSM.toArray(examples);
    return examples;
  }
예제 #3
0
  /**
   * Builds one {@link VSMVector} per line of {@code file}.
   *
   * <p>Every word of every vector has its entry in {@code df} created with weight 1 or
   * replaced by an entry whose weight is one higher.
   *
   * <p>An {@link IOException} while reading is printed and ends the loop; the vectors
   * built up to that point are still returned.
   *
   * @param file text file read line by line (platform default charset)
   * @param stoplist stop list handed to the {@link VSMVector} constructor
   * @return the vectors built from the lines read, in file order
   * @throws SAXException declared for callers; presumably raised by the
   *     {@link VSMVector} constructor (not visible from here)
   */
  protected VSMVector[] createVSM(File file, StopList stoplist) throws SAXException {
    Vector<VSMVector> tempVSM = new Vector<VSMVector>();
    BufferedReader reader = null;
    try {
      reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));

      for (String line = reader.readLine(); line != null; line = reader.readLine()) {
        VSMVector vsm = new VSMVector(line, stoplist);
        tempVSM.add(vsm);
        Iterator iterator1 = vsm.getElements();
        while (iterator1.hasNext()) {
          VSMElement elem = (VSMElement) iterator1.next();
          VSMElement value = (VSMElement) df.get(elem.getWord());
          if (value == null) {
            df.put(elem.getWord(), new VSMElement(elem.getWord(), 1));
          } else {
            df.put(elem.getWord(), new VSMElement(elem.getWord(), value.getWeight() + 1));
          }
        }
      }
    } catch (IOException ex) {
      ex.printStackTrace();
    } finally {
      // Release the file handle on every path, including when an exception
      // (IOException or SAXException) interrupts the loop.
      if (reader != null) {
        try {
          reader.close();
        } catch (IOException closeEx) {
          closeEx.printStackTrace();
        }
      }
    }

    VSMVector[] examples = new VSMVector[tempVSM.size()];
    tempVSM.toArray(examples);
    return examples;
  }
예제 #4
0
  /**
   * Writes the training set to {@code output} in Weka's text format and returns the
   * attribute names.
   *
   * <p>The values of {@code df} are sorted with {@code VSMElementComparator}; each of the
   * leading entries whose weight exceeds {@code minDF} becomes an {@code @ATTRIBUTE ... REAL}
   * line and is appended to the {@code attributes} field. A final {@code class {S,NS}}
   * attribute is declared, followed by the {@code @DATA} section with one row per vector
   * of {@code positiveExamples} (label {@code S}) and {@code negativeExamples}
   * (label {@code NS}). An existing file is overwritten.
   *
   * <p>When {@code positiveTestExamples} is non-null, the test file is written as well
   * through {@code createTestFile}, reusing the same header.
   *
   * @param output path of the file to write
   * @return the contents of {@code attributes} after this call, as an array
   * @throws FileNotFoundException if an output file cannot be opened for writing
   * @throws IOException if writing or closing an output file fails
   */
  public String[] centroid2Weka(String output) throws FileNotFoundException, IOException {
    OutputStream fout = new FileOutputStream(output, false);
    OutputStream bout = new BufferedOutputStream(fout);
    // NOTE(review): no charset is given, so the platform default encoding is used.
    OutputStreamWriter outputFile = new OutputStreamWriter(bout);

    // Stays a StringBuffer because createTestFile takes one.
    StringBuffer header = new StringBuffer();
    Vector bestWordsForm = new Vector(df.values());
    try {
      header.append("@RELATION TSFC");
      header.append("\n");
      header.append("\n");

      Collections.sort(bestWordsForm, new VSMElementComparator());

      // NOTE(review): the inclusive bound considers up to numOfFeatures + 1 entries;
      // confirm whether that is intended before tightening it.
      for (int i = 0; i <= numOfFeatures && i < bestWordsForm.size(); i++) {
        VSMElement elem = (VSMElement) bestWordsForm.elementAt(i);
        if (elem.getWeight() > minDF) {
          header.append("@ATTRIBUTE ");
          header.append(elem.getWord());
          attributes.add(elem.getWord());
          header.append(" REAL");
          header.append("\n");
        }
      }
      header.append("@ATTRIBUTE class {S,NS}");

      StringBuilder tail = new StringBuilder();
      tail.append("\n");
      tail.append("\n");
      tail.append("@DATA");
      tail.append("\n");
      appendSparseTrainingRows(tail, positiveExamples, "S");
      appendSparseTrainingRows(tail, negativeExamples, "NS");

      outputFile.write(header.toString());
      outputFile.flush();
      outputFile.write(tail.toString());
    } finally {
      // Close on every path so the file handle is released even when building
      // the content or writing it throws.
      outputFile.close();
    }

    if (positiveTestExamples != null) {
      createTestFile(output, bestWordsForm, header);
    }

    String[] atts = new String[attributes.size()];
    attributes.toArray(atts);
    return atts;
  }

  /**
   * Appends one row of the form <code>{index value,...,classIndex label}</code> per vector.
   *
   * <p>Only attributes for which the vector returns a non-null element are listed; the
   * element weight is truncated to an {@code int}. The class column index is
   * {@code attributes.size()}.
   */
  private void appendSparseTrainingRows(StringBuilder rows, VSMVector[] examples, String label) {
    for (int i = 0; i < examples.length; i++) {
      VSMVector formTemp = examples[i];
      rows.append("{");
      for (int j = 0; j < attributes.size(); j++) {
        VSMElement elemForm = formTemp.getElement(attributes.elementAt(j));
        if (elemForm != null) {
          rows.append(j).append(" ").append((int) elemForm.getWeight()).append(",");
        }
      }
      rows.append(attributes.size()).append(" ").append(label).append("}");
      rows.append("\n");
    }
  }