public void initValues() { z = new D4(li); x = new D4(li); x.a0 = s_type; s_pos = mf.getFeatureCounter().get(POS).intValue(); // mf.getFeatureBits(POS); s_word = mf.getFeatureCounter().get(WORD); s_type = mf.getFeatureCounter().get(TYPE).intValue(); // mf.getFeatureBits(TYPE); s_char = mf.getFeatureCounter().get(CHAR).intValue(); // mf.getFeatureBits(CHAR); s_oper = mf.getFeatureCounter().get(OPERATION).intValue(); // mf.getFeatureBits(OPERATION); types = new String[mf.getFeatureCounter().get(Pipe.OPERATION)]; for (Entry<String, Integer> e : mf.getFeatureSet().get(Pipe.OPERATION).entrySet()) types[e.getValue()] = e.getKey(); // wds = new String[mf.getFeatureCounter().get(Pipe.WORD)]; // for(Entry<String,Integer> e : mf.getFeatureSet().get(Pipe.WORD).entrySet()) wds[e.getValue()] // = e.getKey(); z.a0 = s_type; z.a1 = s_oper; z.a2 = s_char; z.a3 = s_char; z.a4 = s_char; z.a5 = s_char; z.a6 = s_char; z.a7 = s_char; x.a0 = s_type; x.a1 = s_oper; x.a2 = s_word; x.a3 = s_word; x.a4 = s_word; x.a5 = s_char; x.a6 = s_char; x.a7 = s_char; }
public InstancesTagger createInstances(String file) { InstancesTagger is = new InstancesTagger(); depReader = new CONLLReader09(CONLLReader09.NO_NORMALIZE); depReader.startReading(file); mf.register(REL, "<root-type>"); mf.register(POS, "<root-POS>"); System.out.print("Registering feature parts "); HashMap<String, Integer> ops = new HashMap<String, Integer>(); HashMap<String, HashSet<String>> op2form = new HashMap<String, HashSet<String>>(); int ic = 0; int del = 0; HashSet<String> rm = new HashSet<String>(); while (true) { SentenceData09 instance1 = depReader.getNext(); if (instance1 == null) break; ic++; if (ic % 100 == 0) { del = outValue(ic, del); } String[] labs1 = instance1.labels; for (int i1 = 0; i1 < labs1.length; i1++) { // typeAlphabet.lookupIndex(labs1[i1]); mf.register(REL, labs1[i1]); } String[] w = instance1.forms; for (int i1 = 0; i1 < w.length; i1++) { // saw the first time? if (mf.getValue(WORD, w[i1].toLowerCase()) == -1) opse.put(instance1.forms[i1].toLowerCase(), instance1.lemmas[i1]); mf.register(WORD, w[i1].toLowerCase()); } for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]); w = instance1.lemmas; for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]); for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1].toLowerCase()); w = instance1.plemmas; for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]); for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1].toLowerCase()); for (int i1 = 0; i1 < w.length; i1++) registerChars(CHAR, w[i1]); w = instance1.ppos; for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); w = instance1.gpos; for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); for (int i1 = 1; i1 < w.length; i1++) { String op = getOperation(instance1, i1); if (ops.get(op) == null) ops.put(op, 1); else { ops.put(op, (ops.get(op) + 1)); if (ops.get(op) > 4) rm.add(instance1.forms[i1].toLowerCase()); } HashSet<String> forms = op2form.get(op); if (forms == null) { forms = new HashSet<String>(); op2form.put(op, forms); } forms.add(instance1.forms[i1].toLowerCase()); } } int countFreqSingleMappings = 0; int sc = 0; ArrayList<Entry<String, Integer>> opsl = new ArrayList<Entry<String, Integer>>(); for (Entry<String, Integer> e : ops.entrySet()) { // do not use scripts for infrequent cases or frequent single mappings (der -> die) if (e.getValue() > _MIN_OCCURENT_FOR_SCRIPT_USE && op2form.get(e.getKey()).size() > _MIN_WORDS_MAPPED_BY_SCRIPT) { mf.register(OPERATION, e.getKey()); sc++; opsl.add(e); } else { // do not remove the infrequent cases rm.removeAll(op2form.get(e.getKey())); if (op2form.get(e.getKey()).size() <= 1) countFreqSingleMappings += op2form.get(e.getKey()).size(); } } for (String k : rm) { opse.remove(k); } Collections.sort( opsl, new Comparator<Entry<String, Integer>>() { @Override public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) { return o1.getValue() == o2.getValue() ? 0 : o1.getValue() > o2.getValue() ? 1 : -1; } }); for (Entry<String, Integer> e : opsl) { // System.out.println(e.getKey()+" "+e.getValue()); } if (options.clusterFile == null) cl = new Cluster(); else cl = new Cluster(options.clusterFile, mf, 6); System.out.println("\nfound scripts " + ops.size() + " used scripts " + sc); System.out.println("found mappings of single words " + countFreqSingleMappings); System.out.println("use word maps instead of scripts " + this.opse.size()); // System.out.println(" "+opse); System.out.println("" + mf.toString()); initFeatures(); mf.calculateBits(); initValues(); depReader.startReading(options.trainfile); int i = 0; long start1 = System.currentTimeMillis(); System.out.print("Creating Features: "); is.init(ic, mf); del = 0; while (true) { try { if (i % 100 == 0) { del = outValue(i, del); } SentenceData09 instance1 = depReader.getNext(is); if (instance1 == null) break; is.fillChars(instance1, i, _CEND); if (i > options.count) break; i++; } catch (Exception e) { DB.println("error in sentnence " + i); e.printStackTrace(); } } long end1 = System.currentTimeMillis(); System.gc(); long mem2 = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory(); System.out.print(" time " + (end1 - start1) + " mem " + (mem2 / 1024) + " kb"); types = new String[mf.getFeatureCounter().get(OPERATION)]; for (Entry<String, Integer> e : mf.getFeatureSet().get(OPERATION).entrySet()) { types[e.getValue()] = e.getKey(); // System.out.println("set pos "+e.getKey()); } System.out.println("Num Features: " + mf.size()); return is; }