public static Instances getInstances(String file) throws Exception {
  DataSource datasource = new DataSource(file);
  Instances data = datasource.getDataSet();
  System.out.println("Class index is : " + data.classIndex());
  if (data.classIndex() == -1)
    data.setClassIndex(data.numAttributes() - 1);
  return data;
}
/**
 * Trains a NaiveBayes classifier on a training ARFF file and evaluates it on a test ARFF file.
 *
 * args[0]: train ARFF path
 * args[1]: test ARFF path
 */
public static void run(String[] args) throws Exception {
  DataSource source = new DataSource(args[0]);
  Instances data = source.getDataSet();
  data.setClassIndex(data.numAttributes() - 1);

  NaiveBayes model = new NaiveBayes();
  model.buildClassifier(data);

  // Evaluation:
  Evaluation eval = new Evaluation(data);
  Instances testData = new DataSource(args[1]).getDataSet();
  testData.setClassIndex(testData.numAttributes() - 1);
  eval.evaluateModel(model, testData);

  System.out.println(model.toString());
  System.out.println(eval.toSummaryString("\nResults\n======\n", false));
  System.out.println("======\nConfusion Matrix:");
  double[][] confusionM = eval.confusionMatrix();
  for (int i = 0; i < confusionM.length; ++i) {
    for (int j = 0; j < confusionM[i].length; ++j) {
      System.out.format("%10s ", confusionM[i][j]);
    }
    System.out.print("\n");
  }
}
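A minimal sketch of how this routine might be invoked; the ARFF paths are placeholders, not taken from the original code.

public static void main(String[] args) throws Exception {
  // Hypothetical paths; substitute your own train/test ARFF files.
  run(new String[] {"data/train.arff", "data/test.arff"});
}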
Instances getinstance(String s) throws Exception {
  DataSource source = new DataSource(s);
  Instances data = source.getDataSet();
  // System.out.println(data);
  // System.out.println("**************");
  return data;
}
/**
 * Loads the given dataset and prints the Capabilities necessary to process it.
 *
 * <p>Valid parameters:
 *
 * <p>-file filename <br>
 * the file to load
 *
 * <p>-c index <br>
 * the explicit index of the class attribute (default: none)
 *
 * @param args the command-line arguments
 * @throws Exception if something goes wrong
 */
public static void main(String[] args) throws Exception {
  String tmpStr;
  String filename;
  DataSource source;
  Instances data;
  int classIndex;
  Capabilities cap;
  Iterator iter;

  if (args.length == 0) {
    System.out.println(
        "\nUsage: " + Capabilities.class.getName() + " -file <dataset> [-c <class index>]\n");
    return;
  }

  // get parameters
  tmpStr = Utils.getOption("file", args);
  if (tmpStr.length() == 0)
    throw new Exception("No file provided with option '-file'!");
  else
    filename = tmpStr;

  tmpStr = Utils.getOption("c", args);
  if (tmpStr.length() != 0) {
    if (tmpStr.equals("first"))
      classIndex = 0;
    else if (tmpStr.equals("last"))
      classIndex = -2; // last
    else
      classIndex = Integer.parseInt(tmpStr) - 1;
  } else {
    classIndex = -3; // not set
  }

  // load data
  source = new DataSource(filename);
  if (classIndex == -3)
    data = source.getDataSet();
  else if (classIndex == -2)
    data = source.getDataSet(source.getStructure().numAttributes() - 1);
  else
    data = source.getDataSet(classIndex);

  // determine and print capabilities
  cap = forInstances(data);
  System.out.println("File: " + filename);
  System.out.println(
      "Class index: " + ((data.classIndex() == -1) ? "not set" : "" + (data.classIndex() + 1)));
  System.out.println("Capabilities:");
  iter = cap.capabilities();
  while (iter.hasNext())
    System.out.println("- " + iter.next());
}
public void filterData() throws Exception {
  Instances data = source.getDataSet();
  StringToWordVector stv = new StringToWordVector();
  stv.setOptions(
      weka.core.Utils.splitOptions(
          "-R first-last -W 1000 "
              + "-prune-rate -1.0 -N 0 "
              + "-stemmer weka.core.stemmers.NullStemmer -M 1 "
              + "-tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\r\\n\\t.,;:\\\'\\\"()?!\""));
  stv.setInputFormat(data);
  Instances newdata = Filter.useFilter(data, stv);
  this.inst = newdata;
  this.inst.setClassIndex(0);
}
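For readers who find the escaped option string hard to follow, here is a rough sketch of an equivalent StringToWordVector configuration using typed setters; the prune rate and normalization options are left at their defaults, so treat the mapping as approximate rather than an exact replacement.

import weka.core.Instances;
import weka.core.stemmers.NullStemmer;
import weka.core.tokenizers.WordTokenizer;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.StringToWordVector;

public class BagOfWordsFilter {
  // Roughly the same configuration as the option string above, expressed with typed setters.
  public static Instances toWordVector(Instances data) throws Exception {
    StringToWordVector stv = new StringToWordVector();
    stv.setAttributeIndices("first-last");   // -R first-last
    stv.setWordsToKeep(1000);                // -W 1000
    stv.setMinTermFreq(1);                   // -M 1
    stv.setStemmer(new NullStemmer());       // -stemmer weka.core.stemmers.NullStemmer
    WordTokenizer tok = new WordTokenizer();
    tok.setDelimiters(" \r\n\t.,;:'\"()?!"); // tokenizer delimiters
    stv.setTokenizer(tok);
    stv.setInputFormat(data);
    return Filter.useFilter(data, stv);
  }
}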
private static LinkedList<String> getData(String f) {
  LinkedList<String> all_tweets = new LinkedList<String>();
  Instances data = null;
  try {
    DataSource ds = new DataSource(main_folder + "test_sets/" + f + ".arff");
    data = ds.getDataSet();
  } catch (Exception e) {
    System.out.println("File not found: " + main_folder + "test_sets/" + f + ".arff");
    return all_tweets; // return the empty list; continuing would throw a NullPointerException
  }
  for (int i = 0; i < data.numInstances(); i++) {
    all_tweets.add(data.get(i).stringValue(0));
  }
  return all_tweets;
}
/**
 * Takes a dataset as first argument.
 *
 * @param args the command-line arguments
 * @throws Exception if something goes wrong
 */
public static void main(String[] args) throws Exception {
  // load data
  System.out.println("\n0. Loading data");
  DataSource source = new DataSource(args[0]);
  Instances data = source.getDataSet();
  if (data.classIndex() == -1)
    data.setClassIndex(data.numAttributes() - 1);

  // 1. meta-classifier
  useClassifier(data);
  // 2. filter
  useFilter(data);
  // 3. low-level
  useLowLevel(data);
}
@Override
public void crossValidation(String traindata) throws Exception {
  DataSource ds = new DataSource(traindata);
  Instances instances = ds.getDataSet();
  StringToWordVector stv = new StringToWordVector();
  stv.setOptions(
      weka.core.Utils.splitOptions(
          "-R first-last -W 1000 "
              + "-prune-rate -1.0 -N 0 "
              + "-stemmer weka.core.stemmers.NullStemmer -M 1 "
              + "-tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\r\\n\\t.,;:\\\'\\\"()?!\""));
  stv.setInputFormat(instances);
  instances = Filter.useFilter(instances, stv);
  instances.setClassIndex(0);

  Evaluation eval = new Evaluation(instances);
  eval.crossValidateModel(this.classifier, instances, 10, new Random(1));
  System.out.println(eval.toSummaryString());
  System.out.println(eval.toMatrixString());
}
public void initialize() throws ResourceInitializationException {
  try {
    String arffHeaderFileName = (String) getConfigParameterValue(PARAM_ARFF_HEADER_FILE_NAME);
    DataSource source = new DataSource(arffHeaderFileName);
    wekaInstances = source.getDataSet();
    System.out.println(
        "Weka Instances successfully instantiated from header file at " + arffHeaderFileName);

    String arffDataFileName = (String) getConfigParameterValue(PARAM_ARFF_DATA_FILE_NAME);
    dataFile = new File(arffDataFileName);
    if (!dataFile.exists()) {
      dataFile.createNewFile();
      System.out.println("ARFF data file created at " + dataFile.getPath());
    } else {
      System.out.println("ARFF data file opened at " + dataFile.getPath());
    }
  } catch (Exception e) {
    throw new ResourceInitializationException(e);
  }
}
private static void run() throws Exception {
  DataSource source = new DataSource("src/files/powerpuffgirls.arff");
  int folds = 10;
  int runs = 30;

  HashMap<String, Classifier> hash = new HashMap<>();
  hash.put("J48", new J48());
  hash.put("NaiveBayes", new NaiveBayes());
  hash.put("IBk=1", new IBk(1));
  hash.put("IBk=3", new IBk(3));
  hash.put("MultilayerPerceptron", new MultilayerPerceptron());
  // LibSVM svm = new LibSVM();
  // svm.setOptions(new String[]{"-S 0 -K 2 -D 3 -G 0.0 -R 0.0 -N 0.5 -M 0.40 -C 1.0 -E 0.001 -P 0.1"});
  // hash.put("LibSVM", svm);

  Instances data = source.getDataSet();
  data.setClassIndex(4);

  System.out.println("#seed \t correctly classified instances \t percent correct\n");
  for (Entry<String, Classifier> entry : hash.entrySet()) {
    System.out.println("\n Algorithm: " + entry.getKey() + "\n");
    for (int i = 1; i <= runs; i++) {
      Evaluation eval = new Evaluation(data);
      eval.crossValidateModel(entry.getValue(), data, folds, new Random(i));
      System.out.println(summary(eval));
    }
  }
}
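The snippet above calls a summary(eval) helper that is not shown. A hypothetical implementation, assuming it simply reports the correctly classified count and percentage from the Evaluation, might be:

// Hypothetical helper, not part of the original snippet: format the correctly classified
// count and the percentage of correct classifications from a weka Evaluation.
private static String summary(Evaluation eval) {
  return String.format("%.0f \t %.2f%%", eval.correct(), eval.pctCorrect());
}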
@Override
public void loadData(String data) throws Exception {
  this.source = new DataSource(data);
  this.inst = source.getDataSet();
  if (this.inst.classIndex() == -1)
    this.inst.setClassIndex(this.inst.numAttributes() - 1);
}
public static void main(String args[]) {
  Timers timer = new Timers();
  try {
    // Get the data set path.
    String referenceFile = Utils.getOption('r', args);
    String queryFile = Utils.getOption('q', args);
    if (referenceFile.length() == 0)
      throw new IllegalArgumentException(
          "Required option: File containing the reference dataset.");

    // Load input dataset.
    DataSource source = new DataSource(referenceFile);
    Instances referenceData = source.getDataSet();

    Instances queryData = null;
    if (queryFile.length() != 0) {
      source = new DataSource(queryFile);
      queryData = source.getDataSet();
    }

    timer.StartTimer("total_time");

    // Get all the parameters.
    String leafSize = Utils.getOption('l', args);
    String neighbors = Utils.getOption('k', args);

    // Validate options.
    int k = 0;
    if (neighbors.length() == 0) {
      throw new IllegalArgumentException(
          "Required option: Number of nearest neighbors to find.");
    } else {
      k = Integer.parseInt(neighbors);
      if (k < 1 || k > referenceData.numInstances())
        throw new IllegalArgumentException("[Fatal] Invalid k");
    }

    int l = 20;
    if (leafSize.length() != 0)
      l = Integer.parseInt(leafSize);

    // Create KDTree.
    KDTree tree = new KDTree();
    tree.setMaxInstInLeaf(l);
    tree.setInstances(referenceData);

    // Perform All K-Nearest-Neighbors; only the total running time is reported,
    // so the neighbour sets themselves are discarded.
    if (queryFile.length() != 0) {
      for (int i = 0; i < queryData.numInstances(); i++) {
        Instances out = tree.kNearestNeighbours(queryData.instance(i), k);
      }
    } else {
      for (int i = 0; i < referenceData.numInstances(); i++) {
        Instances out = tree.kNearestNeighbours(referenceData.instance(i), k);
      }
    }

    timer.StopTimer("total_time");
    timer.PrintTimer("total_time");
  } catch (IOException e) {
    System.err.println(USAGE);
  } catch (Exception e) {
    e.printStackTrace();
  }
}
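If a brute-force baseline is wanted for comparison against the KDTree timing, Weka's LinearNNSearch exposes the same kNearestNeighbours call. A minimal sketch, not part of the original benchmark, follows.

import weka.core.Instance;
import weka.core.Instances;
import weka.core.neighboursearch.LinearNNSearch;

public class BruteForceKnn {
  // Sketch: run the same k-NN queries with a linear scan instead of a KDTree.
  public static void queryAll(Instances reference, Instances queries, int k) throws Exception {
    LinearNNSearch search = new LinearNNSearch(reference);
    for (int i = 0; i < queries.numInstances(); i++) {
      Instance q = queries.instance(i);
      Instances neighbours = search.kNearestNeighbours(q, k); // result ignored, as in the timed loop above
    }
  }
}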
public WekaClassifier() throws Exception {
  DataSource source_train = new DataSource("files/train1.arff");
  DataSource source_test = new DataSource("files/test1.arff");
  _train = source_train.getDataSet();
  _test = source_test.getDataSet();
}
private double[] classify(String test) {
  String[] lab = {
    "I.2", "I.3", "I.5", "I.6", "I.2.1", "I.2.6", "I.2.8", "I.3.5", "I.3.6", "I.3.7",
    "I.5.1", "I.5.2", "I.5.4", "I.6.3", "I.6.5", "I.6.8",
  };
  int NSel = 1000; // number of attributes to select
  Filter[] filters = new Filter[2];
  double[] x = new double[16];
  double[] prd = new double[16];
  double clsLabel;
  Ranker rank = new Ranker();
  Evaluation eval = null;
  StringToWordVector stwv = new StringToWordVector();
  weka.filters.supervised.attribute.AttributeSelection featSel =
      new weka.filters.supervised.attribute.AttributeSelection();
  WordTokenizer wtok = new WordTokenizer();
  String delim = " \r\n\t.,;:'\"()?!$*-&[]+/|\\";
  InfoGainAttributeEval ig = new InfoGainAttributeEval();
  String[] stwvOpts;
  wtok.setDelimiters(delim);
  Instances[] dataRaw = new Instances[10000];
  DataSource[] source = new DataSource[16];
  String str;
  Instances testset = null;
  DataSource testsrc = null;
  try {
    testsrc = new DataSource(test);
    testset = testsrc.getDataSet();
  } catch (Exception e1) {
    e1.printStackTrace();
  }
  for (int j = 0; j < 16; j++) { // 16 elements, 0-15
    try {
      str = lab[j];
      source[j] =
          new DataSource(
              "D:/Users/nma1g11/workspace2/WebScraperFlatNew/dataPernode/new/" + str + ".arff");
      dataRaw[j] = source[j].getDataSet();
    } catch (Exception e) {
      e.printStackTrace();
    }
    System.out.println(lab[j]);
    if (dataRaw[j].classIndex() == -1)
      dataRaw[j].setClassIndex(dataRaw[j].numAttributes() - 1);
  }
  if (testset.classIndex() == -1)
    testset.setClassIndex(testset.numAttributes() - 1);
  try {
    stwvOpts =
        weka.core.Utils.splitOptions(
            "-R first-last -W 1000000 -prune-rate -1.0 -C -T -I -N 1 -L -S -stemmer weka.core.stemmers.LovinsStemmer -M 2 ");
    stwv.setOptions(stwvOpts);
    stwv.setTokenizer(wtok);
    rank.setOptions(weka.core.Utils.splitOptions("-T -1.7976931348623157E308 -N 100"));
    rank.setNumToSelect(NSel);
    featSel.setEvaluator(ig);
    featSel.setSearch(rank);
  } catch (Exception e) {
    e.printStackTrace();
  }
  filters[0] = stwv;
  filters[1] = featSel;
  System.out.println("Loading is Done!");
  MultiFilter mfilter = new MultiFilter();
  mfilter.setFilters(filters);
  FilteredClassifier classify = new FilteredClassifier();
  classify.setClassifier(new NaiveBayesMultinomial()); // classification algorithm
  classify.setFilter(mfilter);
  String ss2 = "";
  try {
    Classifier[] clsArr = new Classifier[16];
    clsArr = Classifier.makeCopies(classify, 16);
    String strcls = "";
    List<String> clsList = new ArrayList<String>();
    String s = null;
    String newcls = null;
    String lb = "";
    String prev = "";
    boolean flag = false;
    String Ocls = null;
    int q = 0;
    for (int i = 0; i < 16; i++) {
      for (int k = 0; k < testset.numInstances(); k++) {
        flag = false;
        s = testset.instance(k).stringValue(1);
        clsList.add(s);
        if (lab[i].equals(s)) {
          flag = true;
          newcls = s;
        }
      }
      clsArr[i].buildClassifier(dataRaw[i]);
      eval = new Evaluation(dataRaw[i]);
      for (int j = 0; j < testset.numInstances(); j++) {
        Ocls = testset.instance(j).stringValue(1);
        if (flag && s != null) // original used !s.equals(null), which never does what was intended
          testset.instance(j).setClassValue(lab[i]);
        // -----------------------------------------
        strcls = testset.instance(j).stringValue(1);
        if (i < 4) {
          if (strcls.substring(0, 3).equals(lab[i]))
            testset.instance(j).setClassValue(lab[i]);
        } else if (lab[i].substring(0, 3).equals(strcls))
          testset.instance(j).setClassValue(lab[i]);
        // ------------------------------------------------
        System.out.println(
            dataRaw[i].classAttribute().value(i)
                + " --- > Correct%:" + eval.pctCorrect()
                + " F-measure:" + eval.fMeasure(i));
        if (!prev.equals(testset.instance(j).stringValue(0)) || !lab[i].equals(lb)) {
          clsLabel = clsArr[i].classifyInstance(testset.instance(j));
          x = clsArr[i].distributionForInstance(testset.instance(j));
          prd[i] = x[i];
          System.out.println(" --- > prob: " + clsLabel);
          System.out.println(" --- > x :" + x[i]);
          System.out.println(clsLabel + " --> " + testset.classAttribute().value((int) clsLabel));
        }
        testset.instance(j).setClassValue(Ocls);
        prev = testset.instance(j).stringValue(0);
        lb = lab[i];
      }
      System.out.println("Done with " + lab[i].replace("99", "") + " !!!!!!!!!!!");
    }
    System.out.println(eval.correct());
  } catch (Exception e) {
    e.printStackTrace();
  }
  return prd;
}
public void generateDataSet() {
  // Read all the instances in the file (ARFF, CSV, XRFF, ...)
  try {
    source = new DataSource("data\\bne.csv");
  } catch (Exception e) {
    e.printStackTrace();
  }

  // Create the data set
  try {
    instances = source.getDataSet();
  } catch (Exception e) {
    e.printStackTrace();
  }

  // Reverse the order of instances in the data set to place them in chronological order
  for (int i = 0; i < (instances.numInstances() / 2); i++) {
    instances.swap(i, instances.numInstances() - 1 - i);
  }

  // Remove "volume", "low price", "high price", "opening price" and "date" from the data set
  instances.deleteAttributeAt(instances.numAttributes() - 1);
  instances.deleteAttributeAt(instances.numAttributes() - 2);
  instances.deleteAttributeAt(instances.numAttributes() - 2);
  instances.deleteAttributeAt(instances.numAttributes() - 2);
  instances.deleteAttributeAt(instances.numAttributes() - 2);

  // Create a list to hold the nominal class values "purchase", "sale", "retain"
  List<String> my_nominal_values = new ArrayList<String>(3);
  my_nominal_values.add("purchase");
  my_nominal_values.add("sale");
  my_nominal_values.add("retain");

  // Create the nominal attribute "classIndex" and add it to each instance
  Attribute classIndex = new Attribute("classIndex", my_nominal_values);
  instances.insertAttributeAt(classIndex, instances.numAttributes());

  // Set the value of "classIndex" for each instance by comparing the remaining price
  // attribute of the next instance with that of the current one
  for (int i = 0; i < instances.numInstances() - 1; i++) {
    if (instances.get(i + 1).value(instances.numAttributes() - 2)
        > instances.get(i).value(instances.numAttributes() - 2)) {
      instances.get(i).setValue(instances.numAttributes() - 1, "purchase");
    } else if (instances.get(i + 1).value(instances.numAttributes() - 2)
        < instances.get(i).value(instances.numAttributes() - 2)) {
      instances.get(i).setValue(instances.numAttributes() - 1, "sale");
    } else if (instances.get(i + 1).value(instances.numAttributes() - 2)
        == instances.get(i).value(instances.numAttributes() - 2)) {
      instances.get(i).setValue(instances.numAttributes() - 1, "retain");
    }
  }

  // Make the last attribute be the class
  instances.setClassIndex(instances.numAttributes() - 1);

  // Calculate and insert technical analysis attributes into the data set
  Strategies strategies = new Strategies();
  strategies.applyStrategies();

  // Print header and instances
  System.out.println("\nDataset:\n");
  System.out.println(instances);
  System.out.println(instances.numInstances());
}
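If the generated, labelled dataset needs to be persisted for later experiments, a small sketch using Weka's ArffSaver could be appended after the class index is set; the output path below is illustrative only.

import java.io.File;
import weka.core.Instances;
import weka.core.converters.ArffSaver;

public class DatasetWriter {
  // Sketch: write the Instances produced by generateDataSet() to an ARFF file.
  public static void save(Instances instances) throws Exception {
    ArffSaver saver = new ArffSaver();
    saver.setInstances(instances);                      // dataset built above
    saver.setFile(new File("data/bne-labelled.arff"));  // hypothetical destination
    saver.writeBatch();
  }
}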
public void exec(PrintWriter printer) {
  try {
    FileWriter outFile = null;
    PrintWriter out = null;
    if (printer == null) {
      outFile = new FileWriter(id + ".results");
      out = new PrintWriter(outFile);
    } else out = printer;
    DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
    ProcessTweets tweetsProcessor = null;
    System.out.println("***************************************");
    System.out.println("***\tEXECUTING TEST\t" + id + "***");
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.println("Train size:" + traincorpus.size());
    System.out.println("Test size:" + testcorpus.size());
    out.println("***************************************");
    out.println("***\tEXECUTING TEST\t***");
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.println("Train size:" + traincorpus.size());
    out.println("Test size:" + testcorpus.size());
    String cloneID = "";
    boolean clonar = false;
    if (baseline) {
      System.out.println("***************************************");
      System.out.println("***\tEXECUTING TEST BASELINE\t***");
      System.out.println("+++++++++++++++++++++++++++++++++++++++");
      System.out.println("Train size:" + traincorpus.size());
      System.out.println("Test size:" + testcorpus.size());
      out.println("***************************************");
      out.println("***\tEXECUTING TEST\t***");
      out.println("+++++++++++++++++++++++++++++++++++++++");
      out.println("Train size:" + traincorpus.size());
      out.println("Test size:" + testcorpus.size());
      BaselineClassifier base = new BaselineClassifier(testcorpus, 8);
      precision = base.getPrecision();
      recall = base.getRecall();
      fmeasure = base.getFmeasure();
      System.out.println("+++++++++++++++++++++++++++++++++++++++");
      System.out.printf("Precision: %.3f\n", precision);
      System.out.printf("Recall: %.3f\n", recall);
      System.out.printf("F-measure: %.3f\n", fmeasure);
      System.out.println("***************************************");
      out.println("+++++++++++++++++++++++++++++++++++++++");
      out.printf("Precision: %.3f\n", precision);
      out.printf("Recall: %.3f\n", recall);
      out.printf("F-measure: %.3f\n", fmeasure);
      out.println("***************************************");
      out.flush();
      out.close();
      return;
    } else {
      System.out.println("Stemming: " + stemming);
      System.out.println("Lematization:" + lematization);
      System.out.println("URLs:" + urls);
      System.out.println("Hashtags:" + hashtags);
      System.out.println("Mentions:" + mentions);
      System.out.println("Unigrams:" + unigrams);
      System.out.println("Bigrams:" + bigrams);
      System.out.println("TF:" + tf);
      System.out.println("TF-IDF:" + tfidf);
      out.println("Stemming: " + stemming);
      out.println("Lematization:" + lematization);
      out.println("URLs:" + urls);
      out.println("Hashtags:" + hashtags);
      out.println("Mentions:" + mentions);
      out.println("Unigrams:" + unigrams);
      out.println("Bigrams:" + bigrams);
      out.println("TF:" + tf);
      out.println("TF-IDF:" + tfidf);
    }
    // If the tweets have already been processed, skip reprocessing them
    System.out.println("1-Process tweets " + dateFormat.format(new Date()));
    out.println("1-Process tweets " + dateFormat.format(new Date()));
    List<ProcessedTweet> train = null;
    String[] ids = id.split("-");
    cloneID = ids[0] + "-" + (Integer.valueOf(ids[1]) + 6);
    if (((Integer.valueOf(ids[1]) / 6) % 2) == 0) clonar = true;
    if (new File(id + "-train.ptweets").exists()) {
      train = ProcessedTweetSerialization.fromFile(id + "-train.ptweets");
      tweetsProcessor =
          new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
      if (lematization) {
        tweetsProcessor.doLematization(train);
      }
      if (stemming) {
        tweetsProcessor.doStemming(train);
      }
    } else {
      tweetsProcessor =
          new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
      // The setTraining call was added to distinguish the languages of the URLs in the
      // parallel corpus
      // tweetsProcessor.setTraining(true);
      train = tweetsProcessor.processTweets(traincorpus);
      // tweetsProcessor.setTraining(false);
      ProcessedTweetSerialization.toFile(id + "-train.ptweets", train);
      /*
      if (clonar) {
        File f = new File (id+"-train.ptweets");
        Path p = f.toPath();
        CopyOption[] options = new CopyOption[]{ StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.COPY_ATTRIBUTES };
        Files.copy(p, new File (cloneID+"-train.ptweets").toPath(), options);
        Files.copy(p, new File (ids[0]+"-"+(Integer.valueOf(ids[1])+12)+"-train.ptweets").toPath(), options);
        Files.copy(p, new File (ids[0]+"-"+(Integer.valueOf(ids[1])+18)+"-train.ptweets").toPath(), options);
        Files.copy(p, new File (ids[0]+"-"+(Integer.valueOf(ids[1])+24)+"-train.ptweets").toPath(), options);
        Files.copy(p, new File (ids[0]+"-"+(Integer.valueOf(ids[1])+30)+"-train.ptweets").toPath(), options);
      }
      */
    }
    // Build the bags of words. As before, do not recreate them if they already exist.
    System.out.println("2-Fill topics " + dateFormat.format(new Date()));
    out.println("2-Fill topics " + dateFormat.format(new Date()));
    TopicsList topics = null;
    if (new File(id + ".topics").exists()) {
      topics = TopicsSerialization.fromFile(id + ".topics");
      if (tf) topics.setSelectionFeature(TopicDesc.TERM_TF);
      else topics.setSelectionFeature(TopicDesc.TERM_TF_IDF);
      topics.prepareTopics();
    } else {
      topics = new TopicsList();
      if (tf) topics.setSelectionFeature(TopicDesc.TERM_TF);
      else topics.setSelectionFeature(TopicDesc.TERM_TF_IDF);
      System.out.println("Filling topics " + dateFormat.format(new Date()));
      topics.fillTopics(train);
      System.out.println("Preparing topics " + dateFormat.format(new Date()));
      // Serialization has to happen before preparing; otherwise the TF and TF-IDF values
      // cannot be computed
      System.out.println("Serializing topics " + dateFormat.format(new Date()));
      /*
      if (clonar) {
        TopicsSerialization.toFile(cloneID+".topics", topics);
      }
      */
      topics.prepareTopics();
      TopicsSerialization.toFile(id + ".topics", topics);
    }
    System.out.println("3-Generate arff train file " + dateFormat.format(new Date()));
    out.println("3-Generate arff train file " + dateFormat.format(new Date()));
    // If the ARFF file does not exist, create it; otherwise reuse the previous work, as before
    if (!new File(id + "-train.arff").exists()) {
      BufferedWriter bw = topics.generateArffHeader(id + "-train.arff");
      int tope = traincorpus.size();
      if (tweetsProcessor == null)
        tweetsProcessor =
            new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
      for (int indTweet = 0; indTweet < tope; indTweet++) {
        topics.generateArffVector(bw, train.get(indTweet));
      }
      bw.flush();
      bw.close();
    }
    // Now process the test data
    System.out.println("5-build test dataset " + dateFormat.format(new Date()));
    out.println("5-build test dataset " + dateFormat.format(new Date()));
    List<ProcessedTweet> test = null;
    if (new File(id + "-test.ptweets").exists())
      test = ProcessedTweetSerialization.fromFile(id + "-test.ptweets");
    else {
      if (tweetsProcessor == null)
        tweetsProcessor =
            new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
      test = tweetsProcessor.processTweets(testcorpus);
      ProcessedTweetSerialization.toFile(id + "-test.ptweets", test);
      /*
      if (clonar) {
        File f = new File (id+"-test.ptweets");
        Path p = f.toPath();
        CopyOption[] options = new CopyOption[]{ StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.COPY_ATTRIBUTES };
        Files.copy(p, new File (cloneID+"-test.ptweets").toPath(), options);
      }
      */
    }
    // If the ARFF file does not exist, create it; otherwise reuse the previous work, as before
    if (!new File(id + "-test.arff").exists()) {
      BufferedWriter bw = topics.generateArffHeader(id + "-test.arff");
      int tope = testcorpus.size();
      if (tweetsProcessor == null)
        tweetsProcessor =
            new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
      for (int indTweet = 0; indTweet < tope; indTweet++) {
        topics.generateArffVector(bw, test.get(indTweet));
      }
      bw.flush();
      bw.close();
    }
    int topeTopics = topics.getTopicsList().size();
    topics.getTopicsList().clear();
    // Build the classifier
    // FJRM 25-08-2013: moved earlier to try to free the memory used by the topics
    System.out.println("4-Generate classifier " + dateFormat.format(new Date()));
    out.println("4-Generate classifier " + dateFormat.format(new Date()));
    Classifier cls = null;
    DataSource sourceTrain = null;
    Instances dataTrain = null;
    if (new File(id + "-MNB.classifier").exists()) {
      ObjectInputStream ois = new ObjectInputStream(new FileInputStream(id + "-MNB.classifier"));
      cls = (Classifier) ois.readObject();
      ois.close();
    } else {
      sourceTrain = new DataSource(id + "-train.arff");
      dataTrain = sourceTrain.getDataSet();
      if (dataTrain.classIndex() == -1)
        dataTrain.setClassIndex(dataTrain.numAttributes() - 1);
      // Train the classifier
      cls = new weka.classifiers.bayes.NaiveBayesMultinomial();
      int clase = dataTrain.numAttributes() - 1;
      dataTrain.setClassIndex(clase);
      cls.buildClassifier(dataTrain);
      ObjectOutputStream oos =
          new ObjectOutputStream(new FileOutputStream(id + "-MNB.classifier"));
      oos.writeObject(cls);
      oos.flush();
      oos.close();
      // data.delete(); // not deleted so it can be reused for the SVM
    }
    // Now evaluate the classifier on the test data
    System.out.println("6-Evaluate classifier MNB " + dateFormat.format(new Date()));
    out.println("6-Evaluate classifier MNB " + dateFormat.format(new Date()));
    DataSource sourceTest = new DataSource(id + "-test.arff");
    Instances dataTest = sourceTest.getDataSet();
    int clase = dataTest.numAttributes() - 1;
    dataTest.setClassIndex(clase);
    Evaluation eval = new Evaluation(dataTest);
    eval.evaluateModel(cls, dataTest);
    // Now compute precision, recall and F-measure, and print the confusion matrix
    precision = 0;
    recall = 0;
    fmeasure = 0;
    for (int ind = 0; ind < topeTopics; ind++) {
      precision += eval.precision(ind);
      recall += eval.recall(ind);
      fmeasure += eval.fMeasure(ind);
    }
    precision = precision / topeTopics;
    recall = recall / topeTopics;
    fmeasure = fmeasure / topeTopics;
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.println(eval.toMatrixString());
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.printf("Precision: %.3f\n", precision);
    System.out.printf("Recall: %.3f\n", recall);
    System.out.printf("F-measure: %.3f\n", fmeasure);
    System.out.println("***************************************");
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.println(eval.toMatrixString());
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.printf("Precision: %.3f\n", precision);
    out.printf("Recall: %.3f\n", recall);
    out.printf("F-measure: %.3f\n", fmeasure);
    out.println("***************************************");
    /* DO NOT DELETE (kept for a later SVM run)
    System.out.println("7-Evaluate classifier SVM"+dateFormat.format(new Date()));
    out.println("7-Evaluate classifier SVM"+dateFormat.format(new Date()));
    if (new File(id+"-SVM.classifier").exists()) {
      ObjectInputStream ois = new ObjectInputStream(new FileInputStream(id+"-SVM.classifier"));
      cls = (Classifier) ois.readObject();
      ois.close();
    } else {
      if (dataTrain==null) {
        sourceTrain = new DataSource(id+"-train.arff");
        dataTrain = sourceTrain.getDataSet();
        if (dataTrain.classIndex() == -1) dataTrain.setClassIndex(dataTrain.numAttributes() - 1);
      }
      // Train the classifier
      cls = new weka.classifiers.functions.LibSVM();
      clase = dataTrain.numAttributes()-1;
      dataTrain.setClassIndex(clase);
      cls.buildClassifier(dataTrain);
      ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(id+"-SVM.classifier"));
      oos.writeObject(cls);
      oos.flush();
      oos.close();
      dataTrain.delete();
    }
    eval.evaluateModel(cls, dataTest);
    precision=0;
    recall=0;
    fmeasure=0;
    for(int ind=0; ind<topeTopics; ind++) {
      precision += eval.precision(ind);
      recall += eval.recall(ind);
      fmeasure += eval.fMeasure(ind);
    }
    precision = precision / topeTopics;
    recall = recall / topeTopics;
    fmeasure = fmeasure / topeTopics;
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.println(eval.toMatrixString());
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.printf("Precision: %.3f\n", precision);
    System.out.printf("Recall: %.3f\n", recall);
    System.out.printf("F-measure: %.3f\n", fmeasure);
    System.out.println("***************************************");
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.println(eval.toMatrixString());
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.printf("Precision: %.3f\n", precision);
    out.printf("Recall: %.3f\n", recall);
    out.printf("F-measure: %.3f\n", fmeasure);
    out.println("***************************************");
    */
    System.out.println("Done " + dateFormat.format(new Date()));
    out.println("Done " + dateFormat.format(new Date()));
    if (printer == null) {
      out.flush();
      out.close();
    }
    // Attempt to free memory
    if (dataTrain != null) dataTrain.delete();
    if (dataTest != null) dataTest.delete();
    if (train != null) train.clear();
    if (test != null) test.clear();
    if (topics != null) {
      topics.getTopicsList().clear();
      topics = null;
    }
    if (dataTest != null) dataTest.delete();
    if (cls != null) cls = null;
    if (tweetsProcessor != null) tweetsProcessor = null;
    System.gc();
  } catch (Exception e) {
    e.printStackTrace();
  }
}
public static void execSVM(String expName) {
  try {
    FileWriter outFile = null;
    PrintWriter out = null;
    outFile = new FileWriter(expName + "-SVM.results");
    out = new PrintWriter(outFile);
    DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
    ProcessTweets tweetsProcessor = null;
    System.out.println("***************************************");
    System.out.println("***\tEXECUTING TEST\t" + expName + "***");
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    out.println("***************************************");
    out.println("***\tEXECUTING TEST\t" + expName + "***");
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.println("4-Generate classifier " + dateFormat.format(new Date()));
    Classifier cls = null;
    DataSource sourceTrain = new DataSource(expName + "-train.arff");
    Instances dataTrain = sourceTrain.getDataSet();
    if (dataTrain.classIndex() == -1)
      dataTrain.setClassIndex(dataTrain.numAttributes() - 1);
    // Train the classifier
    // cls = new weka.classifiers.functions.LibSVM();
    int clase = dataTrain.numAttributes() - 1;
    cls = new weka.classifiers.bayes.ComplementNaiveBayes();
    dataTrain.setClassIndex(clase);
    cls.buildClassifier(dataTrain);
    ObjectOutputStream oos =
        new ObjectOutputStream(new FileOutputStream(expName + "-SVM.classifier"));
    oos.writeObject(cls);
    oos.flush();
    oos.close();
    DataSource sourceTest = new DataSource(expName + "-test.arff");
    Instances dataTest = sourceTest.getDataSet();
    dataTest.setClassIndex(clase);
    Evaluation eval = new Evaluation(dataTest);
    eval.evaluateModel(cls, dataTest);
    // Now compute precision, recall and F-measure, and print the confusion matrices
    float precision = 0;
    float recall = 0;
    float fmeasure = 0;
    int topeTopics = 8;
    for (int ind = 0; ind < topeTopics; ind++) {
      precision += eval.precision(ind);
      recall += eval.recall(ind);
      fmeasure += eval.fMeasure(ind);
    }
    precision = precision / topeTopics;
    recall = recall / topeTopics;
    fmeasure = fmeasure / topeTopics;
    System.out.println("++++++++++++++ CNB ++++++++++++++++++++");
    System.out.println(eval.toMatrixString());
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.printf("Precision: %.3f\n", precision);
    System.out.printf("Recall: %.3f\n", recall);
    System.out.printf("F-measure: %.3f\n", fmeasure);
    System.out.println("***************************************");
    out.println("++++++++++++++ CNB ++++++++++++++++++++");
    out.println(eval.toMatrixString());
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.printf("Precision: %.3f\n", precision);
    out.printf("Recall: %.3f\n", recall);
    out.printf("F-measure: %.3f\n", fmeasure);
    out.println("***************************************");
    // Another classifier: ZeroR
    cls = new weka.classifiers.rules.ZeroR();
    dataTrain.setClassIndex(clase);
    cls.buildClassifier(dataTrain);
    eval = new Evaluation(dataTest);
    eval.evaluateModel(cls, dataTest);
    precision = 0;
    recall = 0;
    fmeasure = 0;
    for (int ind = 0; ind < topeTopics; ind++) {
      precision += eval.precision(ind);
      recall += eval.recall(ind);
      fmeasure += eval.fMeasure(ind);
    }
    precision = precision / topeTopics;
    recall = recall / topeTopics;
    fmeasure = fmeasure / topeTopics;
    System.out.println("++++++++++++++ ZEROR ++++++++++++++++++++");
    System.out.println(eval.toMatrixString());
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.printf("Precision: %.3f\n", precision);
    System.out.printf("Recall: %.3f\n", recall);
    System.out.printf("F-measure: %.3f\n", fmeasure);
    System.out.println("***************************************");
    out.println("++++++++++++++ ZEROR ++++++++++++++++++++");
    out.println(eval.toMatrixString());
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.printf("Precision: %.3f\n", precision);
    out.printf("Recall: %.3f\n", recall);
    out.printf("F-measure: %.3f\n", fmeasure);
    out.println("***************************************");
    /* Another classifier: J48
    cls = new weka.classifiers.trees.J48();
    dataTrain.setClassIndex(clase);
    cls.buildClassifier(dataTrain);
    eval = new Evaluation(dataTest);
    eval.evaluateModel(cls, dataTest);
    precision=0;
    recall=0;
    fmeasure=0;
    for(int ind=0; ind<topeTopics; ind++) {
      precision += eval.precision(ind);
      recall += eval.recall(ind);
      fmeasure += eval.fMeasure(ind);
    }
    precision = precision / topeTopics;
    recall = recall / topeTopics;
    fmeasure = fmeasure / topeTopics;
    System.out.println("++++++++++++++ J48 ++++++++++++++++++++");
    System.out.println(eval.toMatrixString());
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.printf("Precision: %.3f\n", precision);
    System.out.printf("Recall: %.3f\n", recall);
    System.out.printf("F-measure: %.3f\n", fmeasure);
    System.out.println("***************************************");
    out.println("++++++++++++++ J48 ++++++++++++++++++++");
    out.println(eval.toMatrixString());
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.printf("Precision: %.3f\n", precision);
    out.printf("Recall: %.3f\n", recall);
    out.printf("F-measure: %.3f\n", fmeasure);
    out.println("***************************************");
    // Another classifier: SMO
    cls = new weka.classifiers.functions.SMO();
    dataTrain.setClassIndex(clase);
    cls.buildClassifier(dataTrain);
    eval = new Evaluation(dataTest);
    eval.evaluateModel(cls, dataTest);
    precision=0;
    recall=0;
    fmeasure=0;
    for(int ind=0; ind<topeTopics; ind++) {
      precision += eval.precision(ind);
      recall += eval.recall(ind);
      fmeasure += eval.fMeasure(ind);
    }
    precision = precision / topeTopics;
    recall = recall / topeTopics;
    fmeasure = fmeasure / topeTopics;
    System.out.println("++++++++++++++ SMO ++++++++++++++++++++");
    System.out.println(eval.toMatrixString());
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.printf("Precision: %.3f\n", precision);
    System.out.printf("Recall: %.3f\n", recall);
    System.out.printf("F-measure: %.3f\n", fmeasure);
    System.out.println("***************************************");
    out.println("++++++++++++++ SMO ++++++++++++++++++++");
    out.println(eval.toMatrixString());
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.printf("Precision: %.3f\n", precision);
    out.printf("Recall: %.3f\n", recall);
    out.printf("F-measure: %.3f\n", fmeasure);
    out.println("***************************************");
    */
    out.flush();
    out.close();
    dataTest.delete();
    dataTrain.delete();
  } catch (FileNotFoundException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  } catch (Exception e) {
    e.printStackTrace();
  }
}
public HashMap<String, String> process(
    Sentence sent,
    String dep,
    HashSet<String> terms,
    List<NamedEntity> entities,
    String author,
    String aidx) {
  try {
    // System.out.println("ML start!");
    // System.out.println("List : " + terms);
    HashMap<String, String> ht = new HashMap<String, String>();
    // Keep only entities that are not contained in a longer entity
    List<NamedEntity> newEntities = new ArrayList<NamedEntity>();
    for (NamedEntity entity : entities) {
      // System.out.println("original: " + entity.entity);
      boolean check = false;
      for (NamedEntity temp : entities) {
        if (entity == temp) continue;
        if (entity.entity.contains(temp.entity)) {
          check = true;
        }
      }
      if (!check) newEntities.add(entity);
    }
    List<DependencyTriple> dtl = getDependencyTripleList(dep);
    List<NamedEntity> targetCands = new ArrayList<NamedEntity>();
    HashMap<NamedEntity, String> tOpinTerm = new HashMap<NamedEntity, String>();
    List<NamedEntity> holderCands = new ArrayList<NamedEntity>();
    HashMap<NamedEntity, String> hOpinTerm = new HashMap<NamedEntity, String>();
    // Write the target-candidate feature vectors to a CSV file
    BufferedWriter writer = new BufferedWriter(new FileWriter("weka_target.csv"));
    writer.write("A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,Class\n");
    boolean check = false;
    List<NamedEntity> targetTmp = new ArrayList<NamedEntity>();
    for (NamedEntity entity : newEntities) {
      // System.out.println("extracted: " + entity.entity);
      String temp = getTargetFeatures(entity, author, terms, dtl);
      // System.out.println(temp);
      if (temp.length() > 1) {
        check = true;
        writer.write(temp);
        String[] toks = temp.split("\n");
        for (int i = 0; i < toks.length; i++) {
          targetTmp.add(entity);
          tOpinTerm.put(entity, toks[i].substring(0, toks[i].indexOf(",")));
        }
      }
    }
    writer.close();
    if (check) {
      // Score the target candidates with the pre-trained model
      DataSource source = new DataSource("weka_target.csv");
      Instances testdata = source.getDataSet();
      testdata.setClassIndex(testdata.numAttributes() - 1);
      Classifier models = (Classifier) weka.core.SerializationHelper.read("target_smoreg.model");
      if (testdata.numInstances() != targetTmp.size())
        System.out.println("wrong number of instances");
      for (int i = 0; i < testdata.numInstances(); i++) {
        double pred = models.classifyInstance(testdata.instance(i));
        if (pred >= 1.0) {
          // System.out.println(pred + " , " + targetTmp.get(i).entity);
          targetCands.add(targetTmp.get(i));
        }
      }
    }
    // Write the holder-candidate feature vectors to a CSV file
    writer = new BufferedWriter(new FileWriter("weka_holder.csv"));
    writer.write("A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,Class\n");
    check = false;
    List<NamedEntity> holderTmp = new ArrayList<NamedEntity>();
    for (NamedEntity entity : newEntities) {
      // System.out.println("extracted: " + entity.entity);
      String temp = getHolderFeatures(entity, author, terms, dtl);
      // System.out.println(temp);
      if (temp.length() > 1) {
        check = true;
        writer.write(temp);
        String[] toks = temp.split("\n");
        for (int i = 0; i < toks.length; i++) {
          holderTmp.add(entity);
          hOpinTerm.put(entity, toks[i].substring(0, toks[i].indexOf(",")));
        }
      }
    }
    writer.close();
    if (check) {
      // Score the holder candidates with the pre-trained model
      DataSource source = new DataSource("weka_holder.csv");
      Instances testdata = source.getDataSet();
      testdata.setClassIndex(testdata.numAttributes() - 1);
      Classifier models = (Classifier) weka.core.SerializationHelper.read("holder_smoreg.model");
      if (testdata.numInstances() != holderTmp.size())
        System.out.println("wrong number of instances");
      for (int i = 0; i < testdata.numInstances(); i++) {
        double pred = models.classifyInstance(testdata.instance(i));
        if (pred >= 1.0) {
          // System.out.println(pred + " , " + holderTmp.get(i).entity);
          holderCands.add(holderTmp.get(i));
        }
      }
    }
    if ((targetCands.size() == 0) || (holderCands.size() == 0)) return ht;
    // Again keep only candidates not contained in a longer candidate
    List<NamedEntity> holderCandTmp = new ArrayList<NamedEntity>();
    for (NamedEntity holderCand : holderCands) {
      boolean hasLonger = false;
      for (NamedEntity temp : holderCands) {
        if (temp.entity.compareTo(holderCand.entity) == 0) continue;
        if (temp.entity.contains(holderCand.entity)) {
          hasLonger = true;
          break;
        }
      }
      if (!hasLonger) holderCandTmp.add(holderCand);
    }
    List<NamedEntity> targetCandTmp = new ArrayList<NamedEntity>();
    for (NamedEntity targetCand : targetCands) {
      boolean hasLonger = false;
      for (NamedEntity temp : targetCands) {
        if (temp.entity.compareTo(targetCand.entity) == 0) continue;
        if (temp.entity.contains(targetCand.entity)) {
          hasLonger = true;
          break;
        }
      }
      if (!hasLonger) targetCandTmp.add(targetCand);
    }
    // Pair up holder and target candidates that share the same opinion term
    for (NamedEntity targetCand : targetCandTmp) {
      if (targetCand.entity.compareTo(author) == 0) continue;
      for (NamedEntity holderCand : holderCandTmp) {
        if (targetCand.entity.compareTo(holderCand.entity) == 0) continue;
        String targetOpin = tOpinTerm.get(targetCand);
        String holderOpin = hOpinTerm.get(holderCand);
        // System.out.println(targetOpin + ", " + holderOpin);
        if (targetOpin.compareTo(holderOpin) != 0) continue;
        String opin =
            targetOpin
                .concat("\t")
                .concat(
                    Integer.toString(sent.sent.indexOf(targetOpin) + sent.beg)
                        .concat("-")
                        .concat(
                            Integer.toString(
                                sent.sent.indexOf(targetOpin) + sent.beg + targetOpin.length())));
        String holder =
            holderCand
                .entity
                .concat("\t")
                .concat(
                    Integer.toString(holderCand.beg)
                        .concat("-")
                        .concat(Integer.toString(holderCand.end)));
        String target =
            targetCand
                .entity
                .concat("\t")
                .concat(
                    Integer.toString(targetCand.beg)
                        .concat("-")
                        .concat(Integer.toString(targetCand.end)));
        ht.put(targetOpin, opin.concat("\t").concat(holder).concat("\t").concat(target));
      }
    }
    return ht;
  } catch (IOException e) {
    e.printStackTrace();
    return null;
  } catch (Exception e) {
    e.printStackTrace();
    return null;
  }
}