private void createTestFile(String output, Vector bestWordsForm, StringBuffer header) throws FileNotFoundException, IOException { OutputStream fout = new FileOutputStream(output + "_test", false); OutputStream bout = new BufferedOutputStream(fout); OutputStreamWriter outputFile = new OutputStreamWriter(bout); StringBuffer tail = new StringBuffer(); tail.append("\n"); tail.append("\n"); tail.append("@DATA"); tail.append("\n"); // System.out.println("Print positve Test..."); for (int i = 0; i < positiveTestExamples.length; i++) { VSMVector formTemp = positiveTestExamples[i]; // formTemp.squaredNormalization(); tail.append("{"); for (int j = 0; j < attributes.size(); j++) { VSMElement elemForm = formTemp.getElement(attributes.elementAt(j)); if (elemForm != null) { tail.append(j); tail.append(" "); tail.append((int) elemForm.getWeight()); tail.append(","); } } tail.append(attributes.size() + " S}"); tail.append("\n"); } // System.out.println("Print negative Test..."); for (int i = 0; i < negativeTestExamples.length; i++) { VSMVector formTemp = negativeTestExamples[i]; // formTemp.squaredNormalization(); tail.append("{"); for (int j = 0; j < attributes.size(); j++) { VSMElement elemForm = formTemp.getElement(attributes.elementAt(j)); if (elemForm != null) { tail.append(j); tail.append(" "); tail.append((int) elemForm.getWeight()); tail.append(","); } } tail.append(attributes.size() + " NS}"); tail.append("\n"); } outputFile.write(header.toString()); outputFile.flush(); outputFile.write(tail.toString()); outputFile.close(); }
protected VSMVector[] createVSM( File[] files, StopList stoplist, int[] indexes, boolean addToFeatures) throws SAXException { Vector<VSMVector> tempVSM = new Vector<VSMVector>(); for (int i = 0; i < files.length && i < indexes.length; i++) { try { VSMVector vsm = new VSMVector(files[indexes[i]].toString(), isForm, stoplist); tempVSM.add(vsm); if (addToFeatures) { Iterator iterator1 = vsm.getElements(); while (iterator1.hasNext()) { VSMElement elem = (VSMElement) iterator1.next(); VSMElement value = (VSMElement) df.get(elem.getWord()); if (value == null) { df.put(elem.getWord(), new VSMElement(elem.getWord(), 1)); } else { df.put(elem.getWord(), new VSMElement(elem.getWord(), value.getWeight() + 1)); } } } // System.out.println(vsm.toString()); } catch (IOException ex) { } } VSMVector[] examples = new VSMVector[tempVSM.size()]; tempVSM.toArray(examples); return examples; }
protected VSMVector[] createVSM(File file, StopList stoplist) throws SAXException { Vector<VSMVector> tempVSM = new Vector<VSMVector>(); try { BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file))); for (String line = reader.readLine(); line != null; line = reader.readLine()) { VSMVector vsm = new VSMVector(line, stoplist); tempVSM.add(vsm); Iterator iterator1 = vsm.getElements(); while (iterator1.hasNext()) { VSMElement elem = (VSMElement) iterator1.next(); VSMElement value = (VSMElement) df.get(elem.getWord()); if (value == null) { df.put(elem.getWord(), new VSMElement(elem.getWord(), 1)); } else { df.put(elem.getWord(), new VSMElement(elem.getWord(), value.getWeight() + 1)); } } // System.out.println(vsm.toString()); } } catch (IOException ex) { ex.printStackTrace(); } VSMVector[] examples = new VSMVector[tempVSM.size()]; tempVSM.toArray(examples); return examples; }
public String[] centroid2Weka(String output) throws FileNotFoundException, IOException { // FileOutputStream fout = new FileOutputStream(output,false); // DataOutputStream dout = new DataOutputStream( fout ); OutputStream fout = new FileOutputStream(output, false); OutputStream bout = new BufferedOutputStream(fout); OutputStreamWriter outputFile = new OutputStreamWriter(bout); StringBuffer header = new StringBuffer(); header.append("@RELATION TSFC"); header.append("\n"); header.append("\n"); StringBuffer tail = new StringBuffer(); // Iterator iterator1 = centroidForm.getElements(); // Vector bestWordsForm = new Vector(); // while (iterator1.hasNext()) { // bestWordsForm.add(iterator1.next()); // } Vector bestWordsForm = new Vector(df.values()); Collections.sort(bestWordsForm, new VSMElementComparator()); // for (int i = 0; i < 10; i++) { // VSMElement elem1 = (VSMElement)bestWordsForm.elementAt(i); // System.out.println(elem1.getWord()); // System.out.println(elem1.getWeight()); // } // System.out.println("Print header..."); for (int i = 0; i <= numOfFeatures && i < bestWordsForm.size(); i++) { VSMElement elem = (VSMElement) bestWordsForm.elementAt(i); if (elem.getWeight() > minDF) { header.append("@ATTRIBUTE "); header.append(elem.getWord()); attributes.add(elem.getWord()); header.append(" REAL"); header.append("\n"); } } header.append("@ATTRIBUTE class {S,NS}"); tail.append("\n"); tail.append("\n"); tail.append("@DATA"); tail.append("\n"); // System.out.println("Print positive..."); for (int i = 0; i < positiveExamples.length; i++) { VSMVector formTemp = positiveExamples[i]; // formTemp.squaredNormalization(); tail.append("{"); for (int j = 0; j < attributes.size(); j++) { VSMElement elemForm = formTemp.getElement(attributes.elementAt(j)); if (elemForm != null) { tail.append(j); tail.append(" "); tail.append((int) elemForm.getWeight()); tail.append(","); } } tail.append(attributes.size() + " S}"); tail.append("\n"); } // System.out.println("Print negative..."); for (int i = 0; i < negativeExamples.length; i++) { VSMVector formTemp = negativeExamples[i]; // formTemp.squaredNormalization(); tail.append("{"); for (int j = 0; j < attributes.size(); j++) { VSMElement elemForm = formTemp.getElement(attributes.elementAt(j)); if (elemForm != null) { tail.append(j); tail.append(" "); tail.append((int) elemForm.getWeight()); tail.append(","); } } tail.append(attributes.size() + " NS}"); tail.append("\n"); } outputFile.write(header.toString()); outputFile.flush(); outputFile.write(tail.toString()); outputFile.close(); if (positiveTestExamples != null) { createTestFile(output, bestWordsForm, header); } String[] atts = new String[attributes.size()]; attributes.toArray(atts); return atts; }