/** * Compile dictionary to letter transducers * * @param file the address of the XML dictionnary to be read * @param dir the direction of the compilation, 'lr' (leftSide-to-right) or 'rl' * (right-to-leftSide) */ public void parse(String file, String dir) { try { direction = dir; XMLInputFactory factory = XMLInputFactory.newInstance(); if (file.equals("-")) { reader = factory.createXMLStreamReader(System.in); } else { reader = factory.createXMLStreamReader(new FileInputStream(file)); } while (reader.hasNext()) { procNode(); reader.next(); } reader.close(); // Minimize transducers for (TransducerComp transducer : sections.values()) { transducer.minimize(); } } catch (FileNotFoundException e) { throw new RuntimeException("Error: Cannot open '" + file + "'."); } catch (RuntimeException e) { System.err.println("Error (" + e + ") at line " + reader.getLocation().getLineNumber()); throw e; } catch (Throwable ex) { System.err.println("Error (" + ex + ") at line " + reader.getLocation().getLineNumber()); throw new RuntimeException("Error t " + reader.getLocation().getLineNumber(), ex); } }
/** * Parse the <e> elements * * @throws javax.xml.stream.XMLStreamException */ private void procEntry() throws XMLStreamException { String attribute = attrib(COMPILER_RESTRICTION_ATTR); String ignore = attrib(COMPILER_IGNORE_ATTR); String name = ""; if (ignore.equals(COMPILER_IGNORE_YES_VAL) || (!(attribute.equals("")) && !(attribute.equals(direction)))) { while (!(name.equals(COMPILER_ENTRY_ELEM))) { reader.next(); if (reader.getEventType() == XMLStreamConstants.END_ELEMENT) { name = reader.getLocalName(); } } return; } ArrayList<EntryToken> elements = new ArrayList<EntryToken>(); while (true) { if (!reader.hasNext()) { throw new RuntimeException( "Error (" + reader.getLocation().getLineNumber() + "): Parse error."); } if (reader.getEventType() == XMLStreamConstants.START_ELEMENT) { name = reader.getLocalName(); } skipBlanks(); name = reader.getLocalName(); int type = reader.getEventType(); if (name.equals(COMPILER_PAIR_ELEM)) { elements.add(procTransduction()); } else if (name.equals(COMPILER_IDENTITY_ELEM)) { elements.add(procIdentity()); } else if (name.equals(COMPILER_FLAG_ELEM)) { elements.add(procFlag()); } else if (name.equals(COMPILER_REGEXP_ELEM)) { elements.add(procRegexp()); } else if (name.equals(COMPILER_PAR_ELEM)) { EntryToken par = procPar(); elements.add(par); // detection of the use of undefined paradigms String p = par.paradigmName; TransducerComp t = paradigms.get(p); if (t == null) { throw new RuntimeException("Error: Undefined paradigm '" + p + "'."); } // descartar entradas con paradigms vac�os (por las direciones, // normalmente if (t.isEmpty()) { while (!name.equals(COMPILER_ENTRY_ELEM) || type != XMLStreamConstants.END_ELEMENT) { reader.next(); if (reader.hasName()) { name = reader.getLocalName(); } type = reader.getEventType(); } return; } reader.next(); } else if (name.equals(COMPILER_ENTRY_ELEM) && type == XMLStreamConstants.END_ELEMENT) { // insertar elements into letter transducer insertEntryTokens(elements); reader.next(); return; } else if (reader.isWhiteSpace()) { } else if (allBlanks()) { if (!reader.hasText()) {} } else { throw new RuntimeException( "Error (" + reader.getLocation().getLineNumber() + "): Invalid inclusion of '<" + name + ">' into '<" + COMPILER_ENTRY_ELEM + ">'."); } } }
/** * Construct symbol pairs by align leftSide side of both parts and insert them into a transducer * * @param pi leftSide part of the transduction * @param pd right part of the transduction * @param state the state from wich insert the new transduction * @param t the transducer * @return the last state of the inserted transduction */ int matchTransduction(ArrayList<Integer> pi, ArrayList<Integer> pd, int state, TransducerComp t) { int izqda, dcha, limizqda, limdcha; if (direction.equals(COMPILER_RESTRICTION_LR_VAL)) { izqda = 0; dcha = 0; limizqda = pi.size(); limdcha = pd.size(); if (pi.size() == 0 && pd.size() == 0) { if (DEBUG) System.err.println("e = " + t.toString()); state = t.insertNewSingleTransduction(alphabet_cast00, state); } else { HashSet<Integer> acx_map_ptr = null; int rsymbol = 0; while (true) { int etiqueta; if (izqda == limizqda && dcha == limdcha) { break; } else if (izqda == limizqda) { etiqueta = alphabet.cast(0, pd.get(dcha)); dcha++; } else if (dcha == limdcha) { Integer pi_izqda = pi.get(izqda); etiqueta = alphabet.cast(pi_izqda, 0); acx_map_ptr = acx_map.get(pi_izqda); // perhaps null rsymbol = 0; izqda++; } else { Integer pi_izqda = pi.get(izqda); Integer pd_dcha = pd.get(dcha); etiqueta = alphabet.cast(pi_izqda, pd_dcha); acx_map_ptr = acx_map.get(pi_izqda); // perhaps null rsymbol = pd_dcha; izqda++; dcha++; } int nuevo_estado = t.insertSingleTransduction(etiqueta, state); if (acx_map_ptr != null) { for (Integer integer : acx_map_ptr) { t.linkStates(state, nuevo_estado, alphabet.cast(integer, rsymbol)); } } state = nuevo_estado; } } return state; } else { izqda = 0; dcha = 0; limizqda = pd.size(); limdcha = pi.size(); if (pi.size() == 0 && pd.size() == 0) { state = t.insertNewSingleTransduction(alphabet_cast00, state); } else { HashSet<Integer> acx_map_ptr = null; int rsymbol = 0; while (true) { int etiqueta; if (izqda == limizqda && dcha == limdcha) { break; } else if (izqda == limizqda) { etiqueta = alphabet.cast(0, pi.get(dcha)); dcha++; } else if (dcha == limdcha) { Integer pd_izqda = pd.get(izqda); etiqueta = alphabet.cast(pd_izqda, 0); acx_map_ptr = acx_map.get(pd_izqda); // perhaps null rsymbol = 0; izqda++; } else { Integer pd_izqda = pd.get(izqda); Integer pi_dcha = pi.get(dcha); etiqueta = alphabet.cast(pd_izqda, pi_dcha); acx_map_ptr = acx_map.get(pd_izqda); // perhaps null rsymbol = pi_dcha; izqda++; dcha++; } int nuevo_estado = t.insertSingleTransduction(etiqueta, state); if (acx_map_ptr != null) { for (Integer integer : acx_map_ptr) { t.linkStates(state, nuevo_estado, alphabet.cast(integer, rsymbol)); } } state = nuevo_estado; } } return state; } }
/** * Insert a list of tokens into the paradigm / section being processed * * @param elements the list */ private void insertEntryTokens(ArrayList<EntryToken> elements) { if (DEBUG) System.err.println("insertEntryTokens( " + elements); if (!current_paradigm.equals("")) { // compilation of paradigms TransducerComp t = paradigms.get(current_paradigm); if (t == null) { t = new TransducerComp(); paradigms.put(current_paradigm, t); } Integer e = t.getInitial(); for (int i = 0, limit = elements.size(); i < limit; i++) { EntryToken entry = elements.get(i); if (entry.isParadigm()) { if (!paradigms.containsKey(entry.paradigmName)) { paradigms.put(entry.paradigmName, new TransducerComp()); } e = t.insertTransducer(e, paradigms.get(entry.paradigmName)); } else if (entry.isSingleTransduction()) { e = matchTransduction(entry.leftSide, entry.rightSide, e, t); } else if (entry.isRegexp()) { RegexpCompiler analyzer = new RegexpCompiler(); analyzer.initialize(alphabet); analyzer.compile(entry.regexp); t.setEpsilon_Tag(alphabet_cast00); e = t.insertTransducer(e, analyzer.getTransducer()); } else { throw new RuntimeException( "Error (" + reader.getLocation().getLineNumber() + "): Invalid entry token."); } } t.setFinal(e); } else { // compilation of the dictionary TransducerComp t; if (!sections.containsKey(current_section)) { t = new TransducerComp(); sections.put(current_section, t); } else { t = sections.get(current_section); } int e = t.getInitial(); for (int i = 0, limit = elements.size(); i < limit; i++) { EntryToken entry = elements.get(i); if (entry.isParadigm()) { final String paradigmName = entry.paradigmName; if (i == elements.size() - 1) { // paradigm sufix if (!suffix_paradigms.containsKey(current_section)) { suffix_paradigms.put(current_section, new HashMap<String, Integer>()); } if (suffix_paradigms.get(current_section).containsKey(paradigmName)) { t.linkStates(e, suffix_paradigms.get(current_section).get(paradigmName), 0); e = postsuffix_paradigms.get(current_section).get(paradigmName); } else { e = t.insertNewSingleTransduction(alphabet_cast00, e); suffix_paradigms.get(current_section).put(paradigmName, e); t.setEpsilon_Tag(0); e = t.insertTransducer(e, paradigms.get(paradigmName)); if (!postsuffix_paradigms.containsKey(current_section)) { postsuffix_paradigms.put(current_section, new HashMap<String, Integer>()); } postsuffix_paradigms.get(current_section).put(paradigmName, e); } } else if (i == 0) { // paradigm prefix if (!prefix_paradigms.containsKey(current_section)) { prefix_paradigms.put(current_section, new HashMap<String, Integer>()); } if (prefix_paradigms.get(current_section).containsKey(paradigmName)) { e = prefix_paradigms.get(current_section).get(paradigmName); } else { t.setEpsilon_Tag(0); e = t.insertTransducer(e, paradigms.get(paradigmName)); prefix_paradigms.get(current_section).put(paradigmName, e); } } else { // paradigm intermediate if (!paradigms.containsKey(paradigmName)) { paradigms.put(paradigmName, new TransducerComp()); } t.setEpsilon_Tag(0); e = t.insertTransducer(e, paradigms.get(paradigmName)); } } else if (entry.isRegexp()) { RegexpCompiler analyzer = new RegexpCompiler(); analyzer.initialize(alphabet); analyzer.compile(entry.regexp); t.setEpsilon_Tag(alphabet_cast00); e = t.insertTransducer(e, analyzer.getTransducer()); } else { e = matchTransduction(entry.leftSide, entry.rightSide, e, t); } } t.setFinal(e); } }