public DataFES fillVector(F2SF params, Instances is, int inst, DataFES d, Cluster cluster) throws InterruptedException { long ts = System.nanoTime(); if (executerService.isShutdown()) executerService = java.util.concurrent.Executors.newCachedThreadPool(); final int length = is.length(inst); if (d == null || d.len < length) d = new DataFES(length, mf.getFeatureCounter().get(PipeGen.REL).shortValue()); ArrayList<ParallelExtract> pe = new ArrayList<ParallelExtract>(); for (int i = 0; i < Parser.THREADS; i++) pe.add(new ParallelExtract(extractor[i], is, inst, d, (F2SF) params.clone(), cluster)); for (int w1 = 0; w1 < length; w1++) { for (int w2 = w1 + 1; w2 < length; w2++) { if (w1 == w2) continue; ParallelExtract.add(w1, w2); } } // for(int i=0;i<efp.length;i++) efp[i].start(); // for(int i=0;i<efp.length;i++) efp[i].join(); executerService.invokeAll(pe); timeExtract += (System.nanoTime() - ts); return d; }
public void initValues() { z = new D4(li); x = new D4(li); x.a0 = s_type; s_pos = mf.getFeatureCounter().get(POS).intValue(); // mf.getFeatureBits(POS); s_word = mf.getFeatureCounter().get(WORD); s_type = mf.getFeatureCounter().get(TYPE).intValue(); // mf.getFeatureBits(TYPE); s_char = mf.getFeatureCounter().get(CHAR).intValue(); // mf.getFeatureBits(CHAR); s_oper = mf.getFeatureCounter().get(OPERATION).intValue(); // mf.getFeatureBits(OPERATION); types = new String[mf.getFeatureCounter().get(Pipe.OPERATION)]; for (Entry<String, Integer> e : mf.getFeatureSet().get(Pipe.OPERATION).entrySet()) types[e.getValue()] = e.getKey(); // wds = new String[mf.getFeatureCounter().get(Pipe.WORD)]; // for(Entry<String,Integer> e : mf.getFeatureSet().get(Pipe.WORD).entrySet()) wds[e.getValue()] // = e.getKey(); z.a0 = s_type; z.a1 = s_oper; z.a2 = s_char; z.a3 = s_char; z.a4 = s_char; z.a5 = s_char; z.a6 = s_char; z.a7 = s_char; x.a0 = s_type; x.a1 = s_oper; x.a2 = s_word; x.a3 = s_word; x.a4 = s_word; x.a5 = s_char; x.a6 = s_char; x.a7 = s_char; }
public void createInstances(String file, Instances is) throws Exception { CONLLReader09 depReader = new CONLLReader09(file); mf.register(REL, "<root-type>"); // register at least one predicate since the parsing data might not contain predicates as in // the Japaness corpus but the development sets contains some long sl = 0; System.out.print("Registering feature parts of sentence: "); int ic = 0; int del = 0; while (true) { SentenceData09 instance = depReader.getNext(); if (instance == null) break; ic++; sl += instance.labels.length; if (ic % 1000 == 0) { del = outValue(ic, del); } String[] labs1 = instance.labels; for (int i1 = 0; i1 < labs1.length; i1++) mf.register(REL, labs1[i1]); String[] w = instance.forms; for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1])); w = instance.plemmas; for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1])); w = instance.ppos; for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); w = instance.gpos; for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); if (instance.feats != null) { String fs[][] = instance.feats; for (int i1 = 0; i1 < fs.length; i1++) { w = fs[i1]; if (w == null) continue; for (int i2 = 0; i2 < w.length; i2++) mf.register(FEAT, w[i2]); } } if ((ic - 1) > options.count) break; } del = outValue(ic, del); System.out.println(); Extractor.initFeatures(); Extractor.maxForm = mf.getFeatureCounter().get(WORD); if (options.clusterFile == null) cl = new Cluster(); else cl = new Cluster(options.clusterFile, mf, 6); mf.calculateBits(); Extractor.initStat(options.featureCreation); System.out.println("" + mf.toString()); for (Extractor e : extractor) e.init(); depReader.startReading(file); int num1 = 0; is.init(ic, new MFO()); Edges.init(mf.getFeatureCounter().get(POS)); System.out.print("Creating edge filters and read corpus: "); del = 0; while (true) { if (num1 % 100 == 0) del = outValue(num1, del); SentenceData09 instance1 = depReader.getNext(is); if 
(instance1 == null) break; int last = is.size() - 1; short[] pos = is.pposs[last]; for (int k = 0; k < is.length(last); k++) { if (is.heads[last][k] < 0) continue; Edges.put(pos[is.heads[last][k]], pos[k], is.labels[last][k]); // Edges.put(pos[k],pos[is.heads[last][k]], is.labels[last][k]); } if (!options.allFeatures && num1 > options.count) break; num1++; } del = outValue(num1, del); System.out.println(); Edges.findDefault(); }
/**
 * Initialize the features.
 *
 * <p>Registers the feature-template names and the boundary symbols with {@code mf} and stores
 * the values returned by {@code mf.register} in the {@code _f0}..{@code _f38}, {@code _swrd},
 * {@code _ewrd} and {@code _CEND} fields. The method takes no parameters.
 *
 * <p>NOTE(review): the returned values are presumably feature indices, in which case the order
 * of the registrations below matters; confirm against the implementation of {@code mf}.
 */
public void initFeatures() {
  // Register 50 generic TYPE entries "F0".."F49" before the named templates.
  for (int k = 0; k < 50; k++) {
    mf.register(TYPE, "F" + k);
  }

  // Named feature templates; each field keeps the value returned by the registration.
  _f0 = mf.register(TYPE, _F0);
  _f1 = mf.register(TYPE, _F1);
  _f2 = mf.register(TYPE, _F2);
  _f3 = mf.register(TYPE, _F3);
  _f4 = mf.register(TYPE, _F4);
  _f5 = mf.register(TYPE, _F5);
  _f6 = mf.register(TYPE, _F6);
  _f7 = mf.register(TYPE, _F7);
  _f8 = mf.register(TYPE, _F8);
  _f9 = mf.register(TYPE, _F9);
  _f10 = mf.register(TYPE, _F10);
  _f11 = mf.register(TYPE, _F11);
  _f12 = mf.register(TYPE, _F12);
  _f13 = mf.register(TYPE, _F13);
  _f14 = mf.register(TYPE, _F14);
  _f15 = mf.register(TYPE, _F15);
  _f16 = mf.register(TYPE, _F16);
  _f17 = mf.register(TYPE, _F17);
  _f18 = mf.register(TYPE, _F18);
  _f19 = mf.register(TYPE, _F19);
  _f20 = mf.register(TYPE, _F20);
  _f21 = mf.register(TYPE, _F21);
  _f22 = mf.register(TYPE, _F22);
  _f23 = mf.register(TYPE, _F23);
  _f24 = mf.register(TYPE, _F24);
  _f25 = mf.register(TYPE, _F25);
  _f26 = mf.register(TYPE, _F26);
  _f27 = mf.register(TYPE, _F27);
  _f28 = mf.register(TYPE, _F28);
  _f29 = mf.register(TYPE, _F29);
  _f30 = mf.register(TYPE, _F30);
  _f31 = mf.register(TYPE, _F31);
  _f32 = mf.register(TYPE, _F32);
  _f33 = mf.register(TYPE, _F33);
  _f34 = mf.register(TYPE, _F34);
  _f35 = mf.register(TYPE, _F35);
  _f36 = mf.register(TYPE, _F36);
  _f37 = mf.register(TYPE, _F37);
  _f38 = mf.register(TYPE, _F38);

  // Boundary symbols for the POS, WORD and CHAR feature classes.
  mf.register(POS, MID);
  mf.register(POS, STR);
  mf.register(POS, END);
  mf.register(TYPE, CHAR);
  _swrd = mf.register(WORD, STR);
  _ewrd = mf.register(WORD, END);
  _CEND = mf.register(CHAR, END);
}
/**
 * Registers each {@code char} of {@code word}, as a one-character string, under the feature
 * class {@code type}.
 *
 * @param type the feature class to register the characters under
 * @param word the word whose characters are registered; must not be {@code null}
 */
private void registerChars(String type, String word) {
  for (char c : word.toCharArray()) {
    mf.register(type, String.valueOf(c));
  }
}
public InstancesTagger createInstances(String file) { InstancesTagger is = new InstancesTagger(); depReader = new CONLLReader09(CONLLReader09.NO_NORMALIZE); depReader.startReading(file); mf.register(REL, "<root-type>"); mf.register(POS, "<root-POS>"); System.out.print("Registering feature parts "); HashMap<String, Integer> ops = new HashMap<String, Integer>(); HashMap<String, HashSet<String>> op2form = new HashMap<String, HashSet<String>>(); int ic = 0; int del = 0; HashSet<String> rm = new HashSet<String>(); while (true) { SentenceData09 instance1 = depReader.getNext(); if (instance1 == null) break; ic++; if (ic % 100 == 0) { del = outValue(ic, del); } String[] labs1 = instance1.labels; for (int i1 = 0; i1 < labs1.length; i1++) { // typeAlphabet.lookupIndex(labs1[i1]); mf.register(REL, labs1[i1]); } String[] w = instance1.forms; for (int i1 = 0; i1 < w.length; i1++) { // saw the first time? if (mf.getValue(WORD, w[i1].toLowerCase()) == -1) opse.put(instance1.forms[i1].toLowerCase(), instance1.lemmas[i1]); mf.register(WORD, w[i1].toLowerCase()); } for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]); w = instance1.lemmas; for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]); for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1].toLowerCase()); w = instance1.plemmas; for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]); for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1].toLowerCase()); for (int i1 = 0; i1 < w.length; i1++) registerChars(CHAR, w[i1]); w = instance1.ppos; for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); w = instance1.gpos; for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); for (int i1 = 1; i1 < w.length; i1++) { String op = getOperation(instance1, i1); if (ops.get(op) == null) ops.put(op, 1); else { ops.put(op, (ops.get(op) + 1)); if (ops.get(op) > 4) rm.add(instance1.forms[i1].toLowerCase()); } HashSet<String> forms = op2form.get(op); if (forms == null) { forms = new 
HashSet<String>(); op2form.put(op, forms); } forms.add(instance1.forms[i1].toLowerCase()); } } int countFreqSingleMappings = 0; int sc = 0; ArrayList<Entry<String, Integer>> opsl = new ArrayList<Entry<String, Integer>>(); for (Entry<String, Integer> e : ops.entrySet()) { // do not use scripts for infrequent cases or frequent single mappings (der -> die) if (e.getValue() > _MIN_OCCURENT_FOR_SCRIPT_USE && op2form.get(e.getKey()).size() > _MIN_WORDS_MAPPED_BY_SCRIPT) { mf.register(OPERATION, e.getKey()); sc++; opsl.add(e); } else { // do not remove the infrequent cases rm.removeAll(op2form.get(e.getKey())); if (op2form.get(e.getKey()).size() <= 1) countFreqSingleMappings += op2form.get(e.getKey()).size(); } } for (String k : rm) { opse.remove(k); } Collections.sort( opsl, new Comparator<Entry<String, Integer>>() { @Override public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) { return o1.getValue() == o2.getValue() ? 0 : o1.getValue() > o2.getValue() ? 1 : -1; } }); for (Entry<String, Integer> e : opsl) { // System.out.println(e.getKey()+" "+e.getValue()); } if (options.clusterFile == null) cl = new Cluster(); else cl = new Cluster(options.clusterFile, mf, 6); System.out.println("\nfound scripts " + ops.size() + " used scripts " + sc); System.out.println("found mappings of single words " + countFreqSingleMappings); System.out.println("use word maps instead of scripts " + this.opse.size()); // System.out.println(" "+opse); System.out.println("" + mf.toString()); initFeatures(); mf.calculateBits(); initValues(); depReader.startReading(options.trainfile); int i = 0; long start1 = System.currentTimeMillis(); System.out.print("Creating Features: "); is.init(ic, mf); del = 0; while (true) { try { if (i % 100 == 0) { del = outValue(i, del); } SentenceData09 instance1 = depReader.getNext(is); if (instance1 == null) break; is.fillChars(instance1, i, _CEND); if (i > options.count) break; i++; } catch (Exception e) { DB.println("error in sentnence " + i); 
e.printStackTrace(); } } long end1 = System.currentTimeMillis(); System.gc(); long mem2 = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory(); System.out.print(" time " + (end1 - start1) + " mem " + (mem2 / 1024) + " kb"); types = new String[mf.getFeatureCounter().get(OPERATION)]; for (Entry<String, Integer> e : mf.getFeatureSet().get(OPERATION).entrySet()) { types[e.getValue()] = e.getKey(); // System.out.println("set pos "+e.getKey()); } System.out.println("Num Features: " + mf.size()); return is; }