/**
 * Retrieve the pages in which the given pages are embedded.
 *
 * One {@code EmbeddedInCallable} task is queued per input page; every task
 * result that is a list is merged into a single list, which is then sorted
 * and stripped of entries whose title matches the previously kept entry.
 *
 * @param wikipedia Wikipedia.
 * @param pageList List of pages.
 * @param namespaces List of name spaces to look into.
 * @param limit Flag indicating if the number of results should be limited.
 * @return Sorted list of pages without consecutive same-title duplicates,
 *         or {@code null} if <code>pageList</code> is {@code null} or empty.
 *         The list may be partial if {@code shouldStop()} became true
 *         before all results were collected.
 * @throws APIException Declared by the signature; presumably propagated from
 *         the result retrieval (TODO confirm the exact source).
 */
@SuppressWarnings("unchecked")
public List<Page> retrieveAllEmbeddedIn(
    EnumWikipedia wikipedia, List<Page> pageList,
    List<Integer> namespaces, boolean limit) throws APIException {

  // Nothing to look up: callers receive null (not an empty list)
  if ((pageList == null) || pageList.isEmpty()) {
    return null;
  }

  // Queue one task per requested page
  final API api = APIFactory.getAPI();
  for (final Page requested : pageList) {
    addTask(new EmbeddedInCallable(
        wikipedia, this, api, requested, namespaces, limit));
  }

  // Gather task results until none remain or a stop is requested
  List<Page> collected = new ArrayList<Page>();
  while (hasRemainingTask() && !shouldStop()) {
    Object outcome = getNextResult();
    if (!(outcome instanceof List<?>)) {
      // Non-list results are ignored
      continue;
    }
    List<Page> embedding = (List<Page>) outcome;
    for (Page found : embedding) {
      collected.add(found);
    }
  }

  // Sorting brings identical titles next to each other, so comparing each
  // entry with the last kept one is enough to drop duplicates
  Collections.sort(collected);
  Page lastKept = null;
  for (Iterator<Page> it = collected.iterator(); it.hasNext();) {
    Page current = it.next();
    boolean duplicate =
        (lastKept != null) &&
        Page.areSameTitle(lastKept.getTitle(), current.getTitle());
    if (duplicate) {
      it.remove();
      continue;
    }
    lastKept = current;
  }
  return collected;
}
/** * Update redirect and missing information of a list of pages. * * @param root Root element. * @param pages List of pages. * @throws JDOMException */ public void updateRedirect(Element root, Collection<Page> pages) throws JDOMException { // Retrieving redirects XPath xpaRedirects = XPath.newInstance("/api/query/redirects/r"); List listRedirects = xpaRedirects.selectNodes(root); XPath xpaFrom = XPath.newInstance("./@from"); XPath xpaTo = XPath.newInstance("./@to"); // Retrieving pages XPath xpaPages = XPath.newInstance("/api/query/pages"); Element listPages = (Element) xpaPages.selectSingleNode(root); XPath xpaPageId = XPath.newInstance("./@pageid"); XPath xpaNamespace = XPath.newInstance("./@ns"); XPath xpaTitle = XPath.newInstance("./@title"); // Retrieving normalization information Map<String, String> normalization = new HashMap<String, String>(); retrieveNormalization(root, normalization); // Analyzing redirects Iterator itRedirect = listRedirects.iterator(); while (itRedirect.hasNext()) { Element currentRedirect = (Element) itRedirect.next(); String fromPage = xpaFrom.valueOf(currentRedirect); String toPage = xpaTo.valueOf(currentRedirect); for (Page p : pages) { // Find if the redirect is already taken into account boolean exists = false; Iterator<Page> itPage = p.getRedirectIteratorWithPage(); while (itPage.hasNext()) { Page tmp = itPage.next(); String title = getNormalizedTitle(tmp.getTitle(), normalization); if (Page.areSameTitle(title, toPage)) { exists = true; } } // Add the redirect if needed itPage = p.getRedirectIteratorWithPage(); while (itPage.hasNext()) { Page tmp = itPage.next(); String title = getNormalizedTitle(tmp.getTitle(), normalization); if (!exists && Page.areSameTitle(title, fromPage)) { XPath xpaPage = createXPath("page", "title", toPage); List listTo = xpaPage.selectNodes(listPages); if (!listTo.isEmpty()) { Element to = (Element) listTo.get(0); Page pageTo = DataManager.getPage(p.getWikipedia(), xpaTitle.valueOf(to), null, null, 
null); pageTo.setNamespace(xpaNamespace.valueOf(to)); pageTo.setPageId(xpaPageId.valueOf(to)); p.addRedirect(pageTo); } } } } } // Analyzing missing pages for (Page p : pages) { Iterator<Page> itPage = p.getRedirectIteratorWithPage(); while (itPage.hasNext()) { Page tmp = itPage.next(); String title = getNormalizedTitle(tmp.getTitle(), normalization); XPath xpaPage = createXPath("page", "title", title); Element page = (Element) xpaPage.selectSingleNode(listPages); if (page != null) { List pageId = xpaPageId.selectNodes(page); if ((pageId != null) && (!pageId.isEmpty())) { tmp.setExisting(Boolean.TRUE); } else { Attribute attrMissing = page.getAttribute("missing"); if (attrMissing != null) { tmp.setExisting(Boolean.FALSE); } } } } } }