private void writeRecordToFile(WikiPage wikiPage, int cnIndex) { // Simplest approach, prior to any data processing: // Just get the whole {{}}, and the previous text delimited by . and . String pageText = wikiPage.getWikiText(); try { // pageName // NOTE: titles seem to include a new line delimiter, so it is stripped. String pageName = wikiPage.getTitle().replace("\n", "").replace(",", "").trim(); int secondIndex = pageText.indexOf("}}", cnIndex); String tagContents = pageText.substring(cnIndex, secondIndex + 2); tagContents = tagContents.replace("\n", "").replace(",", "").trim(); int precedingStartingIndex = cnIndex - PRECEDING_SENTENCE_LENGTH; String precedingSentence; if (precedingStartingIndex < 0) { System.out.println( "cnIndex was " + cnIndex + " so I saved precedingSentence from 0 to cnIndex." + " The page was " + pageName + "."); precedingSentence = pageText.substring(0, cnIndex); } else { precedingSentence = pageText.substring(precedingStartingIndex, cnIndex); precedingSentence = precedingSentence.replace("\n", "").replace(",", "").trim(); } fileWriter.append(pageName); fileWriter.append(COMMA_DELIMITER); fileWriter.append(tagContents); fileWriter.append(COMMA_DELIMITER); fileWriter.append(precedingSentence); fileWriter.append(NEW_LINE_SEPARATOR); } catch (Exception e) { System.out.println( "Error! We caught an exception. cnIndex was " + cnIndex + " on page " + wikiPage.getTitle() + "."); // e.printStackTrace(); } }
public void process(WikiPage page) { // Increment total number of pages pagesTotal++; String text = page.getWikiText(); // Start at the beginning of the page. int currentIndex = 0; // Get the first index. If it's there, then we increment pagesWith. currentIndex = getNextCaseInsensitiveIndex(currentIndex, text); if (currentIndex != -1) pagesWith++; // Either start on the first index or skip if there aren't any. while (currentIndex != -1) { numTotal++; writeRecordToFile(page, currentIndex); currentIndex += ADVANCE_AMOUNT; // System.out.println("I found one and I'm asking for " + currentIndex); currentIndex = getNextCaseInsensitiveIndex(currentIndex, text); } }