/** * Extract results from a labelled full text in the training format without any string * modification. * * @param result reult * @param tokenizations toks * @return extraction */ private StringBuffer trainingExtraction(String result, ArrayList<String> tokenizations) { // this is the main buffer for the whole full text StringBuffer buffer = new StringBuffer(); try { StringTokenizer st = new StringTokenizer(result, "\n"); String s1 = null; String s2 = null; String lastTag = null; // current token position int p = 0; boolean start = true; boolean openFigure = false; boolean headFigure = false; boolean descFigure = false; boolean tableBlock = false; while (st.hasMoreTokens()) { boolean addSpace = false; String tok = st.nextToken().trim(); if (tok.length() == 0) { continue; } StringTokenizer stt = new StringTokenizer(tok, " \t"); ArrayList<String> localFeatures = new ArrayList<String>(); int i = 0; boolean newLine = false; int ll = stt.countTokens(); while (stt.hasMoreTokens()) { String s = stt.nextToken().trim(); if (i == 0) { s2 = TextUtilities.HTMLEncode(s); // lexical token boolean strop = false; while ((!strop) && (p < tokenizations.size())) { String tokOriginal = tokenizations.get(p); if (tokOriginal.equals(" ")) { addSpace = true; } else if (tokOriginal.equals(s)) { strop = true; } p++; } } else if (i == ll - 1) { s1 = s; // current tag } else { if (s.equals("LINESTART")) newLine = true; localFeatures.add(s); } i++; } if (newLine && !start) { buffer.append("<lb/>"); } String lastTag0 = null; if (lastTag != null) { if (lastTag.startsWith("I-")) { lastTag0 = lastTag.substring(2, lastTag.length()); } else { lastTag0 = lastTag; } } String currentTag0 = null; if (s1 != null) { if (s1.startsWith("I-")) { currentTag0 = s1.substring(2, s1.length()); } else { currentTag0 = s1; } } boolean closeParagraph = false; if (lastTag != null) { closeParagraph = testClosingTag(buffer, currentTag0, lastTag0, s1); } boolean output = false; if (!currentTag0.equals("<table>") && !currentTag0.equals("<trash>") && !currentTag0.equals("<figure_head>") && !currentTag0.equals("<label>")) { if (openFigure) { buffer.append("\n\t\t\t</figure>\n\n"); } openFigure = false; headFigure = false; descFigure = false; tableBlock = false; } output = writeField(buffer, s1, lastTag0, s2, "<header>", "<front>", addSpace, 3); if (!output) { output = writeField(buffer, s1, lastTag0, s2, "<other>", "<note type=\"other\">", addSpace, 3); } // for paragraph we must distinguish starting and closing tags if (!output) { if (closeParagraph) { output = writeFieldBeginEnd(buffer, s1, "", s2, "<paragraph>", "<p>", addSpace, 3); } else { output = writeFieldBeginEnd(buffer, s1, lastTag, s2, "<paragraph>", "<p>", addSpace, 3); } } if (!output) { output = writeField( buffer, s1, lastTag0, s2, "<page_header>", "<note place=\"headnote\">", addSpace, 3); } if (!output) { output = writeField( buffer, s1, lastTag0, s2, "<page_footnote>", "<note place=\"footnote\">", addSpace, 3); } if (!output) { output = writeField(buffer, s1, lastTag0, s2, "<page>", "<page>", addSpace, 3); } if (!output) { output = writeFieldBeginEnd(buffer, s1, lastTag0, s2, "<reference>", "<bibl>", addSpace, 3); } if (!output) { if (closeParagraph) { output = writeField(buffer, s1, "", s2, "<reference_marker>", "<label>", addSpace, 3); } else output = writeField(buffer, s1, lastTag0, s2, "<reference_marker>", "<label>", addSpace, 3); } if (!output) { output = writeField( buffer, s1, lastTag0, s2, "<citation_marker>", "<ref type=\"biblio\">", addSpace, 3); } if (!output) { output = writeField(buffer, s1, lastTag0, s2, "<section>", "<head>", addSpace, 3); } if (!output) { output = writeField(buffer, s1, lastTag0, s2, "<subsection>", "<head>", addSpace, 3); } if (!output) { if (openFigure) { output = writeField(buffer, s1, lastTag0, s2, "<trash>", "<trash>", addSpace, 4); } else { // output = writeField(buffer, s1, lastTag0, s2, "<trash>", "<figure>\n\t\t\t\t<trash>", output = writeField(buffer, s1, lastTag0, s2, "<trash>", "<trash>", addSpace, 3); if (output) { openFigure = true; } } } if (!output) { output = writeField(buffer, s1, lastTag0, s2, "<equation>", "<formula>", addSpace, 3); } if (!output) { output = writeField( buffer, s1, lastTag0, s2, "<figure_marker>", "<ref type=\"figure\">", addSpace, 3); } if (!output) { if (openFigure) { if (tableBlock && (!lastTag0.equals("<table>")) && (currentTag0.equals("<table>"))) { buffer.append("\n\t\t\t</figure>\n\n"); output = writeField( buffer, s1, lastTag0, s2, "<figure>\n\t\t\t\t<table>", "<figure>", addSpace, 3); if (output) { tableBlock = true; descFigure = false; headFigure = false; } } else { output = writeField(buffer, s1, lastTag0, s2, "<table>", "<table>", addSpace, 4); if (output) { tableBlock = true; } } } else { output = writeField( buffer, s1, lastTag0, s2, "<table>", "<figure>\n\t\t\t\t<table>", addSpace, 3); if (output) { openFigure = true; tableBlock = true; } } } if (!output) { if (openFigure) { if (descFigure && (!lastTag0.equals("<label>")) && (currentTag0.equals("<label>"))) { buffer.append("\n\t\t\t</figure>\n\n"); output = writeField( buffer, s1, lastTag0, s2, "<label>", "<figure>\n\t\t\t\t<figDesc>", addSpace, 3); if (output) { descFigure = true; tableBlock = false; headFigure = false; } } else { output = writeField(buffer, s1, lastTag0, s2, "<label>", "<figDesc>", addSpace, 4); if (output) { descFigure = true; } } } else { output = writeField( buffer, s1, lastTag0, s2, "<label>", "<figure>\n\t\t\t\t<figDesc>", addSpace, 3); if (output) { openFigure = true; descFigure = true; } } } if (!output) { if (openFigure) { if (headFigure && (!lastTag0.equals("<figure_head>")) && (currentTag0.equals("<figure_head>"))) { buffer.append("\n\t\t\t</figure>\n\n"); output = writeField( buffer, s1, lastTag0, s2, "<figure_head>", "<figure>\n\t\t\t\t<head>", addSpace, 3); if (output) { descFigure = false; tableBlock = false; headFigure = true; } } else { output = writeField(buffer, s1, lastTag0, s2, "<figure_head>", "<head>", addSpace, 4); if (output) { headFigure = true; } } } else { output = writeField( buffer, s1, lastTag0, s2, "<figure_head>", "<figure>\n\t\t\t\t<head>", addSpace, 3); if (output) { openFigure = true; headFigure = true; } } } // for item we must distinguish starting and closing tags if (!output) { output = writeFieldBeginEnd(buffer, s1, lastTag, s2, "<item>", "<item>", addSpace, 3); } lastTag = s1; if (!st.hasMoreTokens()) { if (lastTag != null) { testClosingTag(buffer, "", currentTag0, s1); } if (openFigure) { buffer.append("\n\t\t\t</figure>\n\n"); } } if (start) { start = false; } } return buffer; } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid.", e); } }