public void createInstances(String file, Instances is) throws Exception { CONLLReader09 depReader = new CONLLReader09(file); mf.register(REL, "<root-type>"); // register at least one predicate since the parsing data might not contain predicates as in // the Japaness corpus but the development sets contains some long sl = 0; System.out.print("Registering feature parts of sentence: "); int ic = 0; int del = 0; while (true) { SentenceData09 instance = depReader.getNext(); if (instance == null) break; ic++; sl += instance.labels.length; if (ic % 1000 == 0) { del = outValue(ic, del); } String[] labs1 = instance.labels; for (int i1 = 0; i1 < labs1.length; i1++) mf.register(REL, labs1[i1]); String[] w = instance.forms; for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1])); w = instance.plemmas; for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1])); w = instance.ppos; for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); w = instance.gpos; for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); if (instance.feats != null) { String fs[][] = instance.feats; for (int i1 = 0; i1 < fs.length; i1++) { w = fs[i1]; if (w == null) continue; for (int i2 = 0; i2 < w.length; i2++) mf.register(FEAT, w[i2]); } } if ((ic - 1) > options.count) break; } del = outValue(ic, del); System.out.println(); Extractor.initFeatures(); Extractor.maxForm = mf.getFeatureCounter().get(WORD); if (options.clusterFile == null) cl = new Cluster(); else cl = new Cluster(options.clusterFile, mf, 6); mf.calculateBits(); Extractor.initStat(options.featureCreation); System.out.println("" + mf.toString()); for (Extractor e : extractor) e.init(); depReader.startReading(file); int num1 = 0; is.init(ic, new MFO()); Edges.init(mf.getFeatureCounter().get(POS)); System.out.print("Creating edge filters and read corpus: "); del = 0; while (true) { if (num1 % 100 == 0) del = outValue(num1, del); SentenceData09 instance1 = depReader.getNext(is); if (instance1 == null) break; int last = is.size() - 1; short[] pos = is.pposs[last]; for (int k = 0; k < is.length(last); k++) { if (is.heads[last][k] < 0) continue; Edges.put(pos[is.heads[last][k]], pos[k], is.labels[last][k]); // Edges.put(pos[k],pos[is.heads[last][k]], is.labels[last][k]); } if (!options.allFeatures && num1 > options.count) break; num1++; } del = outValue(num1, del); System.out.println(); Edges.findDefault(); }
public InstancesTagger createInstances(String file) { InstancesTagger is = new InstancesTagger(); depReader = new CONLLReader09(CONLLReader09.NO_NORMALIZE); depReader.startReading(file); mf.register(REL, "<root-type>"); mf.register(POS, "<root-POS>"); System.out.print("Registering feature parts "); HashMap<String, Integer> ops = new HashMap<String, Integer>(); HashMap<String, HashSet<String>> op2form = new HashMap<String, HashSet<String>>(); int ic = 0; int del = 0; HashSet<String> rm = new HashSet<String>(); while (true) { SentenceData09 instance1 = depReader.getNext(); if (instance1 == null) break; ic++; if (ic % 100 == 0) { del = outValue(ic, del); } String[] labs1 = instance1.labels; for (int i1 = 0; i1 < labs1.length; i1++) { // typeAlphabet.lookupIndex(labs1[i1]); mf.register(REL, labs1[i1]); } String[] w = instance1.forms; for (int i1 = 0; i1 < w.length; i1++) { // saw the first time? if (mf.getValue(WORD, w[i1].toLowerCase()) == -1) opse.put(instance1.forms[i1].toLowerCase(), instance1.lemmas[i1]); mf.register(WORD, w[i1].toLowerCase()); } for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]); w = instance1.lemmas; for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]); for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1].toLowerCase()); w = instance1.plemmas; for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]); for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1].toLowerCase()); for (int i1 = 0; i1 < w.length; i1++) registerChars(CHAR, w[i1]); w = instance1.ppos; for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); w = instance1.gpos; for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); for (int i1 = 1; i1 < w.length; i1++) { String op = getOperation(instance1, i1); if (ops.get(op) == null) ops.put(op, 1); else { ops.put(op, (ops.get(op) + 1)); if (ops.get(op) > 4) rm.add(instance1.forms[i1].toLowerCase()); } HashSet<String> forms = op2form.get(op); if (forms == null) { forms = new HashSet<String>(); op2form.put(op, forms); } forms.add(instance1.forms[i1].toLowerCase()); } } int countFreqSingleMappings = 0; int sc = 0; ArrayList<Entry<String, Integer>> opsl = new ArrayList<Entry<String, Integer>>(); for (Entry<String, Integer> e : ops.entrySet()) { // do not use scripts for infrequent cases or frequent single mappings (der -> die) if (e.getValue() > _MIN_OCCURENT_FOR_SCRIPT_USE && op2form.get(e.getKey()).size() > _MIN_WORDS_MAPPED_BY_SCRIPT) { mf.register(OPERATION, e.getKey()); sc++; opsl.add(e); } else { // do not remove the infrequent cases rm.removeAll(op2form.get(e.getKey())); if (op2form.get(e.getKey()).size() <= 1) countFreqSingleMappings += op2form.get(e.getKey()).size(); } } for (String k : rm) { opse.remove(k); } Collections.sort( opsl, new Comparator<Entry<String, Integer>>() { @Override public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) { return o1.getValue() == o2.getValue() ? 0 : o1.getValue() > o2.getValue() ? 1 : -1; } }); for (Entry<String, Integer> e : opsl) { // System.out.println(e.getKey()+" "+e.getValue()); } if (options.clusterFile == null) cl = new Cluster(); else cl = new Cluster(options.clusterFile, mf, 6); System.out.println("\nfound scripts " + ops.size() + " used scripts " + sc); System.out.println("found mappings of single words " + countFreqSingleMappings); System.out.println("use word maps instead of scripts " + this.opse.size()); // System.out.println(" "+opse); System.out.println("" + mf.toString()); initFeatures(); mf.calculateBits(); initValues(); depReader.startReading(options.trainfile); int i = 0; long start1 = System.currentTimeMillis(); System.out.print("Creating Features: "); is.init(ic, mf); del = 0; while (true) { try { if (i % 100 == 0) { del = outValue(i, del); } SentenceData09 instance1 = depReader.getNext(is); if (instance1 == null) break; is.fillChars(instance1, i, _CEND); if (i > options.count) break; i++; } catch (Exception e) { DB.println("error in sentnence " + i); e.printStackTrace(); } } long end1 = System.currentTimeMillis(); System.gc(); long mem2 = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory(); System.out.print(" time " + (end1 - start1) + " mem " + (mem2 / 1024) + " kb"); types = new String[mf.getFeatureCounter().get(OPERATION)]; for (Entry<String, Integer> e : mf.getFeatureSet().get(OPERATION).entrySet()) { types[e.getValue()] = e.getKey(); // System.out.println("set pos "+e.getKey()); } System.out.println("Num Features: " + mf.size()); return is; }