public void createInstances(String file, Instances is) throws Exception { CONLLReader09 depReader = new CONLLReader09(file); mf.register(REL, "<root-type>"); // register at least one predicate since the parsing data might not contain predicates as in // the Japaness corpus but the development sets contains some long sl = 0; System.out.print("Registering feature parts of sentence: "); int ic = 0; int del = 0; while (true) { SentenceData09 instance = depReader.getNext(); if (instance == null) break; ic++; sl += instance.labels.length; if (ic % 1000 == 0) { del = outValue(ic, del); } String[] labs1 = instance.labels; for (int i1 = 0; i1 < labs1.length; i1++) mf.register(REL, labs1[i1]); String[] w = instance.forms; for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1])); w = instance.plemmas; for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1])); w = instance.ppos; for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); w = instance.gpos; for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); if (instance.feats != null) { String fs[][] = instance.feats; for (int i1 = 0; i1 < fs.length; i1++) { w = fs[i1]; if (w == null) continue; for (int i2 = 0; i2 < w.length; i2++) mf.register(FEAT, w[i2]); } } if ((ic - 1) > options.count) break; } del = outValue(ic, del); System.out.println(); Extractor.initFeatures(); Extractor.maxForm = mf.getFeatureCounter().get(WORD); if (options.clusterFile == null) cl = new Cluster(); else cl = new Cluster(options.clusterFile, mf, 6); mf.calculateBits(); Extractor.initStat(options.featureCreation); System.out.println("" + mf.toString()); for (Extractor e : extractor) e.init(); depReader.startReading(file); int num1 = 0; is.init(ic, new MFO()); Edges.init(mf.getFeatureCounter().get(POS)); System.out.print("Creating edge filters and read corpus: "); del = 0; while (true) { if (num1 % 100 == 0) del = outValue(num1, del); SentenceData09 instance1 = depReader.getNext(is); if (instance1 == null) break; int last = is.size() - 1; short[] pos = is.pposs[last]; for (int k = 0; k < is.length(last); k++) { if (is.heads[last][k] < 0) continue; Edges.put(pos[is.heads[last][k]], pos[k], is.labels[last][k]); // Edges.put(pos[k],pos[is.heads[last][k]], is.labels[last][k]); } if (!options.allFeatures && num1 > options.count) break; num1++; } del = outValue(num1, del); System.out.println(); Edges.findDefault(); }
public void createInstances(String file, Instances is) // throws Exception { CONLLReader09 depReader = new CONLLReader09(file); mf.register(REL, "<root-type>"); // register at least one predicate since the parsing data might not contain predicates as in // the Japaness corpus but the development sets contains some long sl = 0; System.out.print("Registering feature parts of sentence: "); int ic = 0; int del = 0; while (true) { SentenceData09 instance = depReader.getNext(); if (instance == null) break; ic++; sl += instance.labels.length; if (ic % 1000 == 0) { del = outValue(ic, del); } String[] labs1 = instance.labels; for (int i1 = 0; i1 < labs1.length; i1++) mf.register(REL, labs1[i1]); String[] w = instance.forms; for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1])); w = instance.plemmas; for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1])); w = instance.ppos; for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); w = instance.gpos; for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); if (instance.feats != null) { String fs[][] = instance.feats; for (int i1 = 0; i1 < fs.length; i1++) { w = fs[i1]; if (w == null) continue; for (int i2 = 0; i2 < w.length; i2++) mf.register(FEAT, w[i2]); } } if ((ic - 1) > options.count) break; } del = outValue(ic, del); System.out.println(); ExtractorReranker.initFeatures(); ExtractorReranker.maxForm = mf.getFeatureCounter().get(WORD); if (options.clusterFile == null) cl = new Cluster(); else cl = new Cluster(options.clusterFile, mf, 6); mf.calculateBits(); extractor.initStat(); System.out.println("" + mf.toString()); extractor.init(); depReader.startReading(file); int num1 = 0; is.init(ic, new MFB()); Edges.init(mf.getFeatureCounter().get(POS)); del = 0; del = outValue(num1, del); System.out.println(); }