예제 #1
0
  /**
   * Compiles an XML dictionary into letter transducers.
   *
   * <p>Every event of the document is handed to {@code procNode()}; once the whole document has
   * been read, every transducer stored in {@code sections} is minimized.
   *
   * @param file the path of the XML dictionary to be read, or {@code "-"} to read standard input
   * @param dir the direction of the compilation, 'lr' (left-to-right) or 'rl' (right-to-left)
   * @throws RuntimeException if the file cannot be opened, or if any error is raised while the
   *     document is being processed (the original error is kept as the cause, or rethrown as-is
   *     when it already is a {@code RuntimeException})
   */
  public void parse(String file, String dir) {
    // Declared outside the try so the finally block can release the file handle:
    // XMLStreamReader.close() does not close the underlying input source.
    FileInputStream fileStream = null;
    // Forget any reader left over from a previous call so the error handlers below never report
    // a position that belongs to another document.
    reader = null;
    try {
      direction = dir;
      XMLInputFactory factory = XMLInputFactory.newInstance();
      if (file.equals("-")) {
        // Standard input is owned by the process; it is deliberately not closed here.
        reader = factory.createXMLStreamReader(System.in);
      } else {
        fileStream = new FileInputStream(file);
        reader = factory.createXMLStreamReader(fileStream);
      }
      while (reader.hasNext()) {
        procNode();
        reader.next();
      }
      reader.close();
      // Minimize transducers
      for (TransducerComp transducer : sections.values()) {
        transducer.minimize();
      }

    } catch (FileNotFoundException e) {
      throw new RuntimeException("Error: Cannot open '" + file + "'.", e);
    } catch (RuntimeException e) {
      // The reader may not exist yet (e.g. the failure happened while creating it); -1 is the
      // same "not available" value that Location.getLineNumber() itself uses.
      int line = (reader != null) ? reader.getLocation().getLineNumber() : -1;
      System.err.println("Error (" + e + ") at line " + line);
      throw e;
    } catch (Throwable ex) {
      int line = (reader != null) ? reader.getLocation().getLineNumber() : -1;
      System.err.println("Error (" + ex + ") at line " + line);
      throw new RuntimeException("Error t " + line, ex);
    } finally {
      if (fileStream != null) {
        try {
          fileStream.close();
        } catch (java.io.IOException ignored) {
          // Best effort: the stream was only read from, so a failure to close it cannot lose
          // data and must not hide the outcome of the parse itself.
        }
      }
    }
  }
예제 #2
0
  /**
   * Parses one {@code <e>} (entry) element and inserts its content into the transducer being
   * built.
   *
   * <p>The entry is skipped entirely (the reader is advanced to its end tag and nothing is
   * inserted) when its ignore attribute equals {@code COMPILER_IGNORE_YES_VAL}, or when its
   * restriction attribute is non-empty and differs from the current compilation direction.
   * Otherwise each child element (pair, identity, flag, regexp, paradigm reference) is turned into
   * an {@code EntryToken}; when the end tag of the entry is reached the collected tokens are
   * passed to {@code insertEntryTokens}. An entry that references an empty paradigm is dropped
   * without inserting anything.
   *
   * @throws javax.xml.stream.XMLStreamException propagated from the underlying XML stream reader
   * @throws RuntimeException if the document ends inside the entry, if a referenced paradigm is
   *     not defined, or if an unexpected element appears inside the entry
   */
  private void procEntry() throws XMLStreamException {
    String attribute = attrib(COMPILER_RESTRICTION_ATTR);
    String ignore = attrib(COMPILER_IGNORE_ATTR);
    String name = "";
    // Entry is flagged as ignored, or restricted to the other direction: consume events up to
    // the end tag named COMPILER_ENTRY_ELEM and return without producing anything.
    if (ignore.equals(COMPILER_IGNORE_YES_VAL)
        || (!(attribute.equals("")) && !(attribute.equals(direction)))) {
      while (!(name.equals(COMPILER_ENTRY_ELEM))) {
        reader.next();
        // Only end tags update 'name', so the loop stops on the entry's closing tag.
        if (reader.getEventType() == XMLStreamConstants.END_ELEMENT) {
          name = reader.getLocalName();
        }
      }
      return;
    }

    // Tokens collected from the children of this entry, in document order.
    ArrayList<EntryToken> elements = new ArrayList<EntryToken>();

    while (true) {
      if (!reader.hasNext()) {
        throw new RuntimeException(
            "Error (" + reader.getLocation().getLineNumber() + "): Parse error.");
      }
      // NOTE(review): the value assigned here is overwritten unconditionally right after
      // skipBlanks() below, so this assignment has no lasting effect.
      if (reader.getEventType() == XMLStreamConstants.START_ELEMENT) {
        name = reader.getLocalName();
      }
      skipBlanks();
      name = reader.getLocalName();
      int type = reader.getEventType();

      // Dispatch on the element name; each proc* helper returns the token for that element.
      if (name.equals(COMPILER_PAIR_ELEM)) {
        elements.add(procTransduction());
      } else if (name.equals(COMPILER_IDENTITY_ELEM)) {
        elements.add(procIdentity());
      } else if (name.equals(COMPILER_FLAG_ELEM)) {
        elements.add(procFlag());
      } else if (name.equals(COMPILER_REGEXP_ELEM)) {
        elements.add(procRegexp());
      } else if (name.equals(COMPILER_PAR_ELEM)) {
        EntryToken par = procPar();
        elements.add(par);

        // detection of the use of undefined paradigms
        String p = par.paradigmName;
        TransducerComp t = paradigms.get(p);
        if (t == null) {
          throw new RuntimeException("Error: Undefined paradigm '" + p + "'.");
        }
        // discard entries that use an empty paradigm (normally a consequence of the direction
        // restrictions): skip to the entry's end tag and return without inserting anything
        if (t.isEmpty()) {
          while (!name.equals(COMPILER_ENTRY_ELEM) || type != XMLStreamConstants.END_ELEMENT) {
            reader.next();
            // 'name' keeps its previous value on events that carry no name.
            if (reader.hasName()) {
              name = reader.getLocalName();
            }
            type = reader.getEventType();
          }
          return;
        }
        reader.next();
      } else if (name.equals(COMPILER_ENTRY_ELEM) && type == XMLStreamConstants.END_ELEMENT) {
        // insert the collected elements into the letter transducer
        insertEntryTokens(elements);
        reader.next();
        return;
      } else if (reader.isWhiteSpace()) {
        // NOTE(review): intentionally empty; nothing here advances the reader, so progress
        // presumably relies on skipBlanks() at the top of the loop — confirm.
      } else if (allBlanks()) {
        // NOTE(review): the inner 'if' has an empty body and therefore does nothing.
        if (!reader.hasText()) {}
      } else {
        throw new RuntimeException(
            "Error ("
                + reader.getLocation().getLineNumber()
                + "): Invalid inclusion of '<"
                + name
                + ">' into '<"
                + COMPILER_ENTRY_ELEM
                + ">'.");
      }
    }
  }
예제 #3
0
  /**
   * Builds symbol pairs by aligning both parts of a transduction from their left ends and inserts
   * them, one transition per pair, into a transducer.
   *
   * <p>When the compilation direction equals {@code COMPILER_RESTRICTION_LR_VAL}, {@code pi}
   * supplies the first member of every pair and {@code pd} the second; for any other direction the
   * roles are swapped. If one part is shorter than the other, symbol 0 stands in for its missing
   * positions. When both parts are empty a single {@code alphabet_cast00} transition is inserted.
   *
   * <p>For every step whose first-member symbol has an entry in {@code acx_map}, an additional
   * transition is linked between the same two states for each symbol of that entry, paired with
   * the step's second-member symbol.
   *
   * @param pi left part of the transduction
   * @param pd right part of the transduction
   * @param state the state from which the new transduction is inserted
   * @param t the transducer
   * @return the last state of the inserted transduction
   */
  int matchTransduction(ArrayList<Integer> pi, ArrayList<Integer> pd, int state, TransducerComp t) {
    final boolean leftToRight = direction.equals(COMPILER_RESTRICTION_LR_VAL);

    if (pi.size() == 0 && pd.size() == 0) {
      // The debug trace is only emitted for the left-to-right direction.
      if (DEBUG && leftToRight) System.err.println("e = " + t.toString());
      return t.insertNewSingleTransduction(alphabet_cast00, state);
    }

    // Both directions run the same alignment; only the roles of the two parts differ.
    final ArrayList<Integer> firstPart = leftToRight ? pi : pd;
    final ArrayList<Integer> secondPart = leftToRight ? pd : pi;
    final int firstLimit = firstPart.size();
    final int secondLimit = secondPart.size();
    int firstPos = 0;
    int secondPos = 0;

    while (firstPos < firstLimit || secondPos < secondLimit) {
      // Reset on every step: a step that has no first-member symbol (the first part is already
      // exhausted) must not reuse the equivalence set or second-member symbol of an earlier step.
      HashSet<Integer> equivalents = null;
      int secondSymbol = 0;
      int label;

      if (firstPos == firstLimit) {
        // First part exhausted: pad it with symbol 0.
        label = alphabet.cast(0, secondPart.get(secondPos));
        secondPos++;
      } else if (secondPos == secondLimit) {
        // Second part exhausted: pad it with symbol 0.
        Integer firstSymbol = firstPart.get(firstPos);
        label = alphabet.cast(firstSymbol, 0);
        equivalents = acx_map.get(firstSymbol); // perhaps null
        firstPos++;
      } else {
        Integer firstSymbol = firstPart.get(firstPos);
        Integer pairedSymbol = secondPart.get(secondPos);
        label = alphabet.cast(firstSymbol, pairedSymbol);
        equivalents = acx_map.get(firstSymbol); // perhaps null
        secondSymbol = pairedSymbol;
        firstPos++;
        secondPos++;
      }

      int nextState = t.insertSingleTransduction(label, state);
      if (equivalents != null) {
        // Add a parallel transition for every symbol registered as equivalent to the first
        // member of this pair.
        for (Integer equivalent : equivalents) {
          t.linkStates(state, nextState, alphabet.cast(equivalent, secondSymbol));
        }
      }
      state = nextState;
    }
    return state;
  }
예제 #4
0
  /**
   * Inserts a list of tokens into the paradigm or section currently being processed.
   *
   * <p>The tokens go into the transducer of {@code current_paradigm} when that name is not empty,
   * and into the transducer of {@code current_section} otherwise.
   *
   * @param elements the tokens of one entry, in document order
   */
  private void insertEntryTokens(ArrayList<EntryToken> elements) {
    if (DEBUG) System.err.println("insertEntryTokens( " + elements);

    if (current_paradigm.equals("")) {
      insertTokensIntoSection(elements);
    } else {
      insertTokensIntoParadigm(elements);
    }
  }

  /** Appends the tokens to the transducer of the paradigm being compiled and marks its end final. */
  private void insertTokensIntoParadigm(ArrayList<EntryToken> elements) {
    TransducerComp target = paradigms.get(current_paradigm);
    if (target == null) {
      target = new TransducerComp();
      paradigms.put(current_paradigm, target);
    }

    Integer state = target.getInitial();

    for (EntryToken token : elements) {
      if (token.isParadigm()) {
        // A paradigm that has not been seen yet is registered as a fresh transducer.
        if (!paradigms.containsKey(token.paradigmName)) {
          paradigms.put(token.paradigmName, new TransducerComp());
        }
        state = target.insertTransducer(state, paradigms.get(token.paradigmName));
      } else if (token.isSingleTransduction()) {
        state = matchTransduction(token.leftSide, token.rightSide, state, target);
      } else if (token.isRegexp()) {
        RegexpCompiler regexpCompiler = new RegexpCompiler();
        regexpCompiler.initialize(alphabet);
        regexpCompiler.compile(token.regexp);
        target.setEpsilon_Tag(alphabet_cast00);
        state = target.insertTransducer(state, regexpCompiler.getTransducer());
      } else {
        throw new RuntimeException(
            "Error (" + reader.getLocation().getLineNumber() + "): Invalid entry token.");
      }
    }
    target.setFinal(state);
  }

  /**
   * Appends the tokens to the transducer of the dictionary section being compiled and marks its
   * end final. Paradigm references are handled according to their position in the entry: last
   * (suffix), first (prefix) or anywhere else (intermediate).
   */
  private void insertTokensIntoSection(ArrayList<EntryToken> elements) {
    if (!sections.containsKey(current_section)) {
      sections.put(current_section, new TransducerComp());
    }
    TransducerComp target = sections.get(current_section);
    int state = target.getInitial();
    final int lastIndex = elements.size() - 1;

    for (int pos = 0; pos <= lastIndex; pos++) {
      EntryToken token = elements.get(pos);

      if (token.isParadigm()) {
        final String paradigmName = token.paradigmName;

        if (pos == lastIndex) {
          // Suffix paradigm: the states before and after the paradigm are remembered per
          // section so later entries can link to them instead of inserting the paradigm again.
          if (!suffix_paradigms.containsKey(current_section)) {
            suffix_paradigms.put(current_section, new HashMap<String, Integer>());
          }
          if (!suffix_paradigms.get(current_section).containsKey(paradigmName)) {
            state = target.insertNewSingleTransduction(alphabet_cast00, state);
            suffix_paradigms.get(current_section).put(paradigmName, state);
            target.setEpsilon_Tag(0);
            state = target.insertTransducer(state, paradigms.get(paradigmName));
            if (!postsuffix_paradigms.containsKey(current_section)) {
              postsuffix_paradigms.put(current_section, new HashMap<String, Integer>());
            }
            postsuffix_paradigms.get(current_section).put(paradigmName, state);
          } else {
            target.linkStates(state, suffix_paradigms.get(current_section).get(paradigmName), 0);
            state = postsuffix_paradigms.get(current_section).get(paradigmName);
          }
        } else if (pos == 0) {
          // Prefix paradigm: the state reached after the paradigm is remembered per section
          // and reused by later entries that start with the same paradigm.
          if (!prefix_paradigms.containsKey(current_section)) {
            prefix_paradigms.put(current_section, new HashMap<String, Integer>());
          }
          if (!prefix_paradigms.get(current_section).containsKey(paradigmName)) {
            target.setEpsilon_Tag(0);
            state = target.insertTransducer(state, paradigms.get(paradigmName));
            prefix_paradigms.get(current_section).put(paradigmName, state);
          } else {
            state = prefix_paradigms.get(current_section).get(paradigmName);
          }
        } else {
          // Intermediate paradigm: always inserted in place.
          if (!paradigms.containsKey(paradigmName)) {
            paradigms.put(paradigmName, new TransducerComp());
          }
          target.setEpsilon_Tag(0);
          state = target.insertTransducer(state, paradigms.get(paradigmName));
        }
      } else if (token.isRegexp()) {
        RegexpCompiler regexpCompiler = new RegexpCompiler();
        regexpCompiler.initialize(alphabet);
        regexpCompiler.compile(token.regexp);
        target.setEpsilon_Tag(alphabet_cast00);
        state = target.insertTransducer(state, regexpCompiler.getTransducer());
      } else {
        // Every remaining token kind is treated as a plain transduction.
        state = matchTransduction(token.leftSide, token.rightSide, state, target);
      }
    }
    target.setFinal(state);
  }