Example #1
0
  /**
   * Fills a {@link DataFES} container for one sentence by running the {@code ParallelExtract}
   * workers on the executor service.
   *
   * <p>Every index pair {@code (w1, w2)} with {@code w1 < w2 < is.length(inst)} is handed to
   * {@code ParallelExtract.add} before the workers are started. The time spent in this method is
   * added to {@code timeExtract}.
   *
   * @param params parameters; each worker receives its own {@code clone()} of them
   * @param is the instances holding the sentence
   * @param inst index of the sentence inside {@code is}
   * @param d container to reuse; a new one is allocated when it is {@code null} or its
   *     {@code len} is smaller than the sentence length
   * @param cluster cluster information passed on to every worker
   * @return the container that was filled (either {@code d} or a newly allocated one)
   * @throws InterruptedException if the calling thread is interrupted while waiting for the
   *     workers
   */
  public DataFES fillVector(F2SF params, Instances is, int inst, DataFES d, Cluster cluster)
      throws InterruptedException {

    long ts = System.nanoTime();

    // A pool that has been shut down rejects new tasks, so replace it with a fresh one.
    if (executerService.isShutdown())
      executerService = java.util.concurrent.Executors.newCachedThreadPool();

    final int length = is.length(inst);
    if (d == null || d.len < length)
      d = new DataFES(length, mf.getFeatureCounter().get(PipeGen.REL).shortValue());

    // One worker per configured thread, each with its own extractor and parameter copy.
    ArrayList<ParallelExtract> pe = new ArrayList<ParallelExtract>();
    for (int i = 0; i < Parser.THREADS; i++)
      pe.add(new ParallelExtract(extractor[i], is, inst, d, (F2SF) params.clone(), cluster));

    // w2 always starts above w1, so the two indices can never coincide and no
    // equality guard is needed.
    for (int w1 = 0; w1 < length; w1++) {
      for (int w2 = w1 + 1; w2 < length; w2++) {
        ParallelExtract.add(w1, w2);
      }
    }

    // invokeAll blocks until every worker has completed. The returned futures are discarded,
    // so an exception thrown inside a worker is not propagated from here.
    executerService.invokeAll(pe);

    timeExtract += (System.nanoTime() - ts);

    return d;
  }
Example #2
0
  /**
   * Loads the per-group values from the feature counter into the {@code s_*} fields, rebuilds the
   * {@code types} lookup array for the OPERATION group and configures the two {@code D4}
   * instances {@code z} and {@code x} with those values.
   *
   * <p>{@code z} uses {@code s_char} for slots {@code a2..a7}; {@code x} uses {@code s_word} for
   * {@code a2..a4} and {@code s_char} for {@code a5..a7}. Both use {@code s_type} for {@code a0}
   * and {@code s_oper} for {@code a1}.
   */
  public void initValues() {

    z = new D4(li);
    x = new D4(li);

    // The s_* fields must be loaded before they are copied into z and x below.
    s_pos = mf.getFeatureCounter().get(POS).intValue();
    s_word = mf.getFeatureCounter().get(WORD);
    s_type = mf.getFeatureCounter().get(TYPE).intValue();
    s_char = mf.getFeatureCounter().get(CHAR).intValue();
    s_oper = mf.getFeatureCounter().get(OPERATION).intValue();

    // Inverse lookup: index of an OPERATION entry -> its string.
    types = new String[mf.getFeatureCounter().get(Pipe.OPERATION)];
    for (Entry<String, Integer> e : mf.getFeatureSet().get(Pipe.OPERATION).entrySet())
      types[e.getValue()] = e.getKey();

    z.a0 = s_type;
    z.a1 = s_oper;
    z.a2 = s_char;
    z.a3 = s_char;
    z.a4 = s_char;
    z.a5 = s_char;
    z.a6 = s_char;
    z.a7 = s_char;

    x.a0 = s_type;
    x.a1 = s_oper;
    x.a2 = s_word;
    x.a3 = s_word;
    x.a4 = s_word;
    x.a5 = s_char;
    x.a6 = s_char;
    x.a7 = s_char;
  }
Example #3
0
  /**
   * Registers the feature values found in {@code file} and then reads its sentences into
   * {@code is}, collecting edge statistics on the way.
   *
   * <p>The file is read twice. The first pass registers labels, normalized forms and predicted
   * lemmas, predicted and gold POS tags and morphological features with {@code mf}. After the
   * extractors and the cluster are initialized, the second pass loads each sentence into
   * {@code is} and records its head/dependent entries in {@code Edges}.
   *
   * @param file path of the corpus file to read
   * @param is container that receives the sentences; it is (re)initialized here
   * @throws Exception if reading or initialization fails
   */
  public void createInstances(String file, Instances is) throws Exception {

    CONLLReader09 depReader = new CONLLReader09(file);

    mf.register(REL, "<root-type>");

    // NOTE(review): an earlier comment here said to register at least one predicate because the
    // parsing data (e.g. the Japanese corpus) might contain none while the development set does.
    // No such registration is visible in this method - confirm whether it is still needed.

    System.out.print("Registering feature parts of sentence: ");
    int ic = 0;
    int del = 0;
    while (true) {
      SentenceData09 instance = depReader.getNext();
      if (instance == null) break;
      ic++;

      if (ic % 1000 == 0) {
        del = outValue(ic, del);
      }

      for (String label : instance.labels) mf.register(REL, label);

      // Forms and predicted lemmas share the WORD group and are normalized by the reader.
      for (String form : instance.forms) mf.register(WORD, depReader.normalize(form));
      for (String lemma : instance.plemmas) mf.register(WORD, depReader.normalize(lemma));

      for (String tag : instance.ppos) mf.register(POS, tag);
      for (String tag : instance.gpos) mf.register(POS, tag);

      if (instance.feats != null) {
        for (String[] feats : instance.feats) {
          if (feats == null) continue;
          for (String feat : feats) mf.register(FEAT, feat);
        }
      }

      if ((ic - 1) > options.count) break;
    }
    del = outValue(ic, del);

    System.out.println();
    Extractor.initFeatures();

    Extractor.maxForm = mf.getFeatureCounter().get(WORD);

    if (options.clusterFile == null) cl = new Cluster();
    else cl = new Cluster(options.clusterFile, mf, 6);

    mf.calculateBits();
    Extractor.initStat(options.featureCreation);

    System.out.println("" + mf.toString());

    for (Extractor e : extractor) e.init();

    // Second pass: start over at the beginning of the same file.
    depReader.startReading(file);

    int num1 = 0;

    is.init(ic, new MFO());

    Edges.init(mf.getFeatureCounter().get(POS));

    System.out.print("Creating edge filters and read corpus: ");
    del = 0;

    while (true) {
      if (num1 % 100 == 0) del = outValue(num1, del);

      SentenceData09 instance1 = depReader.getNext(is);

      if (instance1 == null) break;

      // getNext(is) appended the sentence, so it is the last one stored in 'is'.
      int last = is.size() - 1;
      short[] pos = is.pposs[last];

      for (int k = 0; k < is.length(last); k++) {
        // Entries with a negative head index are skipped.
        if (is.heads[last][k] < 0) continue;
        Edges.put(pos[is.heads[last][k]], pos[k], is.labels[last][k]);
      }

      if (!options.allFeatures && num1 > options.count) break;

      num1++;
    }
    del = outValue(num1, del);
    System.out.println();
    Edges.findDefault();
  }
Example #4
0
  /**
   * Registers the fixed feature names with {@code mf} and stores the values returned by
   * {@code register} in the corresponding fields.
   *
   * <p>The statements are kept in this exact order; presumably {@code register} hands out its
   * values in registration order, so reordering could change them - verify before rearranging.
   * The method takes no parameters.
   */
  public void initFeatures() {

    // Register the 50 generic type names "F0" .. "F49"; the returned values are not kept.
    for (int k = 0; k < 50; k++) {
      mf.register(TYPE, "F" + k);
    }

    // Register the named types _F0 .. _F38 and remember the value returned for each one.
    _f0 = mf.register(TYPE, _F0);
    _f1 = mf.register(TYPE, _F1);
    _f2 = mf.register(TYPE, _F2);
    _f3 = mf.register(TYPE, _F3);
    _f4 = mf.register(TYPE, _F4);
    _f5 = mf.register(TYPE, _F5);
    _f6 = mf.register(TYPE, _F6);
    _f7 = mf.register(TYPE, _F7);
    _f8 = mf.register(TYPE, _F8);
    _f9 = mf.register(TYPE, _F9);
    _f10 = mf.register(TYPE, _F10);
    _f11 = mf.register(TYPE, _F11);
    _f12 = mf.register(TYPE, _F12);
    _f13 = mf.register(TYPE, _F13);
    _f14 = mf.register(TYPE, _F14);
    _f15 = mf.register(TYPE, _F15);
    _f16 = mf.register(TYPE, _F16);
    _f17 = mf.register(TYPE, _F17);
    _f18 = mf.register(TYPE, _F18);
    _f19 = mf.register(TYPE, _F19);
    _f20 = mf.register(TYPE, _F20);
    _f21 = mf.register(TYPE, _F21);
    _f22 = mf.register(TYPE, _F22);
    _f23 = mf.register(TYPE, _F23);
    _f24 = mf.register(TYPE, _F24);
    _f25 = mf.register(TYPE, _F25);
    _f26 = mf.register(TYPE, _F26);
    _f27 = mf.register(TYPE, _F27);
    _f28 = mf.register(TYPE, _F28);
    _f29 = mf.register(TYPE, _F29);
    _f30 = mf.register(TYPE, _F30);

    _f31 = mf.register(TYPE, _F31);
    _f32 = mf.register(TYPE, _F32);
    _f33 = mf.register(TYPE, _F33);
    _f34 = mf.register(TYPE, _F34);

    _f35 = mf.register(TYPE, _F35);
    _f36 = mf.register(TYPE, _F36);
    _f37 = mf.register(TYPE, _F37);
    _f38 = mf.register(TYPE, _F38);

    // The MID, STR and END markers are registered as POS values; CHAR is registered as a TYPE.
    mf.register(POS, MID);
    mf.register(POS, STR);
    mf.register(POS, END);
    mf.register(TYPE, CHAR);

    // STR and END are also registered as WORD values; their returned values are kept.
    _swrd = mf.register(WORD, STR);
    _ewrd = mf.register(WORD, END);

    // END registered in the CHAR group; the returned value is kept in _CEND.
    _CEND = mf.register(CHAR, END);
  }
Example #5
0
 /**
  * Registers every {@code char} of {@code word}, as a one-character string, with {@code mf}
  * under the group {@code type}.
  *
  * <p>The word is processed per UTF-16 code unit, so a supplementary character contributes its
  * two surrogates as separate entries.
  *
  * @param type group under which the characters are registered
  * @param word word whose characters are registered; must not be {@code null}
  */
 private void registerChars(String type, String word) {
   for (char c : word.toCharArray()) {
     mf.register(type, String.valueOf(c));
   }
 }
Example #6
0
  /**
   * Registers the feature values found in {@code file}, decides which edit operations are used,
   * and reads the sentences of the file into a new {@link InstancesTagger}.
   *
   * <p>The file is read twice. The first pass registers labels, forms, lemmas, predicted lemmas
   * (each in original and lower-cased spelling), the characters of the predicted lemmas and the
   * POS tags. It also counts, per operation returned by {@code getOperation}, how often it occurs
   * and which lower-cased forms it applies to. Operations that pass both thresholds
   * ({@code _MIN_OCCURENT_FOR_SCRIPT_USE}, {@code _MIN_WORDS_MAPPED_BY_SCRIPT}) are registered in
   * the OPERATION group. Forms first seen in this pass are stored with their lemma in
   * {@code opse}; those collected in the removal set are dropped from it again. The second pass
   * loads the sentences and their characters into the returned container.
   *
   * @param file path of the corpus file; it is read in both passes
   * @return the container holding the sentences that were read
   */
  public InstancesTagger createInstances(String file) {

    InstancesTagger is = new InstancesTagger();

    depReader = new CONLLReader09(CONLLReader09.NO_NORMALIZE);

    depReader.startReading(file);
    mf.register(REL, "<root-type>");
    mf.register(POS, "<root-POS>");

    System.out.print("Registering feature parts ");
    // operation -> number of occurrences
    HashMap<String, Integer> ops = new HashMap<String, Integer>();
    // operation -> lower-cased forms it was seen with
    HashMap<String, HashSet<String>> op2form = new HashMap<String, HashSet<String>>();
    int ic = 0;
    int del = 0;
    // lower-cased forms to be removed from opse after the pass
    HashSet<String> rm = new HashSet<String>();

    while (true) {
      SentenceData09 instance1 = depReader.getNext();
      if (instance1 == null) break;
      ic++;
      if (ic % 100 == 0) {
        del = outValue(ic, del);
      }

      String[] labs1 = instance1.labels;
      for (int i1 = 0; i1 < labs1.length; i1++) {
        mf.register(REL, labs1[i1]);
      }

      String[] w = instance1.forms;
      for (int i1 = 0; i1 < w.length; i1++) {
        String lower = w[i1].toLowerCase();
        // First time this lower-cased form is seen: remember its lemma.
        if (mf.getValue(WORD, lower) == -1) opse.put(lower, instance1.lemmas[i1]);

        mf.register(WORD, lower);
      }
      for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]);

      w = instance1.lemmas;
      for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]);
      for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1].toLowerCase());

      w = instance1.plemmas;
      for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]);
      for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1].toLowerCase());

      // Characters are registered for the predicted lemmas only.
      for (int i1 = 0; i1 < w.length; i1++) registerChars(CHAR, w[i1]);

      w = instance1.ppos;
      for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]);

      w = instance1.gpos;
      for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]);

      // Index 0 is skipped here.
      for (int i1 = 1; i1 < w.length; i1++) {
        String op = getOperation(instance1, i1);
        String lowerForm = instance1.forms[i1].toLowerCase();

        Integer seen = ops.get(op);
        if (seen == null) {
          ops.put(op, 1);
        } else {
          int count = seen + 1;
          ops.put(op, count);
          if (count > 4) rm.add(lowerForm);
        }

        HashSet<String> forms = op2form.get(op);
        if (forms == null) {
          forms = new HashSet<String>();
          op2form.put(op, forms);
        }
        forms.add(lowerForm);
      }
    }

    int countFreqSingleMappings = 0;

    int sc = 0;
    ArrayList<Entry<String, Integer>> opsl = new ArrayList<Entry<String, Integer>>();
    for (Entry<String, Integer> e : ops.entrySet()) {
      HashSet<String> forms = op2form.get(e.getKey());

      // do not use scripts for infrequent cases or frequent single mappings (der -> die)
      if (e.getValue() > _MIN_OCCURENT_FOR_SCRIPT_USE
          && forms.size() > _MIN_WORDS_MAPPED_BY_SCRIPT) {
        mf.register(OPERATION, e.getKey());
        sc++;
        opsl.add(e);
      } else {
        // do not remove the infrequent cases
        rm.removeAll(forms);

        if (forms.size() <= 1) countFreqSingleMappings += forms.size();
      }
    }
    for (String k : rm) {
      opse.remove(k);
    }

    // Ascending by occurrence count. The boxed counts are compared by value: an identity
    // comparison (==) reports equal counts outside the Integer cache as different, which breaks
    // the Comparator contract and can make the sort throw.
    Collections.sort(
        opsl,
        new Comparator<Entry<String, Integer>>() {

          @Override
          public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
            return o1.getValue().compareTo(o2.getValue());
          }
        });
    // opsl is only needed for debugging output, e.g.
    //   for (Entry<String, Integer> e : opsl) System.out.println(e.getKey() + "  " + e.getValue());

    if (options.clusterFile == null) cl = new Cluster();
    else cl = new Cluster(options.clusterFile, mf, 6);

    System.out.println("\nfound scripts " + ops.size() + " used scripts " + sc);
    System.out.println("found mappings of single words " + countFreqSingleMappings);
    System.out.println("use word maps instead of scripts " + this.opse.size());
    System.out.println("" + mf.toString());

    initFeatures();

    mf.calculateBits();
    initValues();

    // Second pass over the same file the sentence count 'ic' and the registered values came
    // from, so the container size and the data always match.
    depReader.startReading(file);

    int i = 0;
    long start1 = System.currentTimeMillis();

    System.out.print("Creating Features: ");
    is.init(ic, mf);
    del = 0;
    while (true) {
      try {
        if (i % 100 == 0) {
          del = outValue(i, del);
        }
        SentenceData09 instance1 = depReader.getNext(is);
        if (instance1 == null) break;

        is.fillChars(instance1, i, _CEND);

        if (i > options.count) break;

        i++;
      } catch (Exception e) {
        // Best effort: report the failure and go on reading; 'i' is not advanced in this case.
        DB.println("error in sentnence " + i);
        e.printStackTrace();
      }
    }
    long end1 = System.currentTimeMillis();
    System.gc();
    long mem2 = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory();
    System.out.print("  time " + (end1 - start1) + " mem " + (mem2 / 1024) + " kb");

    // Inverse lookup: index of an OPERATION entry -> its string.
    types = new String[mf.getFeatureCounter().get(OPERATION)];

    for (Entry<String, Integer> e : mf.getFeatureSet().get(OPERATION).entrySet()) {
      types[e.getValue()] = e.getKey();
    }

    System.out.println("Num Features: " + mf.size());

    return is;
  }