/**
 * Tells whether a link consists of nothing but a single image-like child.
 *
 * @param linkElement the link element to inspect
 * @return whether the current link element is an image link: it must have exactly one child
 *     element, no non-blank text of its own, and that child must match {@code
 *     IMAGE_LINK_CHILDREN_CSS_LIKE_QUERY}
 */
protected boolean isImageLink(Element linkElement) {
  // exactly one child element is required
  if (linkElement.children().size() != 1) {
    return false;
  }
  // a link that carries its own text is a text link, not an image link
  if (StringUtils.isNotBlank(linkElement.ownText())) {
    return false;
  }
  final boolean noImageChild =
      linkElement.children().select(IMAGE_LINK_CHILDREN_CSS_LIKE_QUERY).isEmpty();
  return !noImageChild;
}
/**
 * Tells whether a link consists of nothing but a single svg child.
 *
 * @param linkElement the link element to inspect
 * @return whether the current link element is a svg link: it must have exactly one child
 *     element, no non-blank text of its own, and that child must match {@code
 *     HtmlElementStore.SVG_ELEMENT}
 */
protected boolean isSvgLink(Element linkElement) {
  // exactly one child element is required
  if (linkElement.children().size() != 1) {
    return false;
  }
  // a link that carries its own text is a text link, not a svg link
  if (StringUtils.isNotBlank(linkElement.ownText())) {
    return false;
  }
  final boolean noSvgChild =
      linkElement.children().select(HtmlElementStore.SVG_ELEMENT).isEmpty();
  return !noSvgChild;
}
private void recurse(final Element element, final Map<String, Object> values, final int depth) { final Tag tag = element.tag(); final Set<String> classes = element.classNames(); final String link = element.attr("href"); final Object content = extractChildContent(element); if (!classes.isEmpty()) { removeEmpty(classes); // toplevel classes define type if (tag.isBlock()) { if (depth == 0) { // store type attribute values.put("type", classes); for (final Element child : element.children()) { recurse(child, values, depth + 1); } } else { final Map<String, Object> childMap = new LinkedHashMap<>(); values.put(classes.iterator().next(), childMap); if (content != null) { childMap.put("name", content); } for (final Element child : element.children()) { recurse(child, childMap, depth + 1); } } } else if (tag.isInline()) { // extract href and store as URL if (classes.contains("url") && StringUtils.isNotBlank(link)) { values.put("url", link); classes.remove("url"); } if (content != null) { for (final String type : classes) { values.put(type, content); } } } } }
// Busca os endereços pelo número do CEP. public List<Address> getByCep(String cep) throws IOException { listEnderecos = new ArrayList<Address>(); // mapeamento dos parametros que será passado na requisição Map<String, String> query = new HashMap<String, String>(); query.put("CEP", cep); query.put("Metodo", "listaLogradouro"); query.put("TipoConsulta", "cep"); query.put("StartRow", "1"); query.put("EndRow", "10"); // Faz uma requisição no site do correios (www.buscacep.com.br) com Json, passando os parametros // mapeados, // requisição deverá ser do tipo post. // Armazena o retorno em uma variavel doc. Document doc = Jsoup.connect(Utils.adressCorreios) .data(query) .header("Origin", "http://www.buscacep.correios.com.br") .header("Referer", "http://www.buscacep.correios.com.br") .post(); // Acessa o retorno do doc e percorre o resultado buscando as informações dos endereços // Armazena os resultados na lista de endereços criadas e retorna a mesma para que outras // classes possam acessar. Elements elements = doc.select("table").eq(2); Elements rows = elements.select("tr"); Iterator<Element> rowIterator = rows.iterator(); while (rowIterator.hasNext()) { Address enderecos = new Address(); Element element = rowIterator.next(); Elements logradouro = element.children().select("td").eq(0); enderecos.setLogradouro(logradouro.text()); Elements bairro = element.children().select("td").eq(1); enderecos.setBairro(bairro.text()); Elements cidade = element.children().select("td").eq(2); Elements estado = element.children().select("td").eq(3); StringBuilder sbLocalidade = new StringBuilder(); sbLocalidade.append(cidade.text()); sbLocalidade.append("/"); sbLocalidade.append(estado.text()); enderecos.setLocalidade(sbLocalidade.toString()); Elements codigopostal = element.children().select("td").eq(4); enderecos.setCEP(codigopostal.text()); listEnderecos.add(enderecos); } return listEnderecos; }
/**
 * Reads an item from a design element, adds it to the data source and then does the same,
 * recursively, for every child element, attaching each child item to its parent.
 *
 * @since 7.5.0
 * @param node an element representing the item (tree node); its {@code text} attribute is used
 *     as the item id.
 * @param selected a set accumulating selected items. The item id is added when the element has
 *     a {@code selected} attribute.
 * @param context the DesignContext instance used in parsing
 * @return the item id of the new item
 * @throws DesignException if the tag name of the {@code node} element is not {@code node}.
 */
@Override
protected String readItem(Element node, Set<String> selected, DesignContext context) {
  final String tagName = node.tagName();
  if (!"node".equals(tagName)) {
    throw new DesignException(
        "Unrecognized child element in " + getClass().getSimpleName() + ": " + tagName);
  }

  final String itemId = node.attr("text");
  addItem(itemId);

  if (node.hasAttr("icon")) {
    setItemIcon(
        itemId,
        DesignAttributeHandler.readAttribute("icon", node.attributes(), Resource.class));
  }
  if (node.hasAttr("selected")) {
    selected.add(itemId);
  }

  // children are read first, then hooked up under this item
  for (Element childNode : node.children()) {
    setParent(readItem(childNode, selected, context), itemId);
  }
  return itemId;
}
/**
 * Parses the table rows into program entries, tracking {@code rowspan} counters for the first
 * two columns so that cells spanning several rows are read only once.
 *
 * <p>Any exception raised while handling a row (missing cell, non-numeric {@code rowspan}) is
 * printed to standard output and the row is skipped without decrementing the counters.
 *
 * @param rows the source rows
 * @return one {@code [dt text, dd text]} pair per row; entries stay {@code null} for rows that
 *     did not start a new program cell
 */
private static String[][] parseRows(Elements rows) {
  final int rowCount = rows.size();
  String[][] programs = new String[rowCount][2];
  // rows still covered by the current cell of the leading column / the program column
  int leadingSpanLeft = 0;
  int programSpanLeft = 0;

  for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) {
    Element row = rows.get(rowIndex);
    try {
      Elements cells = row.children();
      if (leadingSpanLeft == 0) {
        // a new leading cell starts here
        Element leadingCell = cells.get(0);
        leadingSpanLeft = Integer.parseInt(leadingCell.attr("rowspan"));
        if (programSpanLeft == 0) {
          Element programCell = cells.get(1);
          programSpanLeft = Integer.parseInt(programCell.attr("rowspan"));
          programs[rowIndex][0] = DBclass.xmlFilte(programCell.select("dt").text());
          programs[rowIndex][1] = DBclass.xmlFilte(programCell.select("dd").text());
        }
      } else if (programSpanLeft == 0) {
        // the leading cell still spans this row, so the program cell is the first one
        Element programCell = cells.get(0);
        programSpanLeft = Integer.parseInt(programCell.attr("rowspan"));
        programs[rowIndex][0] = DBclass.xmlFilte(programCell.select("dt").text());
        programs[rowIndex][1] = DBclass.xmlFilte(programCell.select("dd").text());
      }
      leadingSpanLeft--;
      programSpanLeft--;
    } catch (Exception e) {
      e.printStackTrace(System.out);
    }
  }
  return programs;
}
/**
 * Extracts the areas listed in the first element with class {@code subarea}.
 *
 * <p>Starting at the second child, each {@code <b>} sets the current letter and each {@code <a>}
 * becomes an {@link AreaVO} carrying that letter, the link text, the running {@code index} and
 * the part of the {@code href} between offset 7 and its last {@code '/'}.
 *
 * @param text the HTML to parse
 * @param pid the parent id stored on every area
 * @return the areas found; empty when there is no {@code subarea} element
 */
public List<AreaVO> parseMessage(String text, int pid) {
  List<AreaVO> areas = new ArrayList<AreaVO>();

  Elements subareas = Jsoup.parse(text).body().getElementsByClass("subarea");
  if (subareas.isEmpty()) {
    return areas;
  }

  Elements entries = subareas.get(0).children();
  String currentLetter = "";
  // the first child is skipped on purpose, iteration starts at 1
  for (int pos = 1; pos < entries.size(); pos++) {
    Element entry = entries.get(pos);
    String tag = entry.tagName();

    if ("b".equals(tag)) {
      currentLetter = entry.text();
    } else if ("a".equals(tag)) {
      AreaVO area = new AreaVO();
      area.setLetter(currentLetter);
      area.setName(entry.text());
      area.setOrderIdx(index);
      area.setPid(pid);

      String href = entry.attr("href");
      area.setPinyin(href.substring(7, href.lastIndexOf("/")));

      index++;
      System.out.println(area.toString());
      areas.add(area);
    }
  }
  return areas;
}
public Holder doParse(String html, String url) { Holder holder = new Holder(); holder.url = url; Document doc = Jsoup.parse(html, url); Elements typeElement = doc.select("body > div.main_w.clearfix > div.main.clearfix > ul > li:nth-child(5) > a"); holder.dishType = typeElement.text(); Elements titleElement = doc.select( "body > div.main_w.clearfix > div.main.clearfix > div.cp_header.clearfix > div.cp_main_info_w > div.info1 > h1 > a"); holder.title = titleElement.text(); Elements methodElement = doc.select( "body > div.main_w.clearfix > div.main.clearfix > div.cp_header.clearfix > div.cp_main_info_w > div.info2 > ul > li:nth-child(1) > a"); holder.method = methodElement.text(); Elements materialElement = doc.select( "body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix > div.cp_body_left > div.materials > div > div.yl.zl.clearfix > ul > li > div > h4 > a"); holder.mainMaterial = materialElement.text(); Elements stepE = doc.select( "body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix > div.cp_body_left > div.measure > div.editnew.edit > div.content.clearfix"); // // body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix // > div.cp_body_left > div.measure > div.editnew.edit > // div.content.clearfix // body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix // > div.cp_body_left > div.measure > div.edit > p:nth-child(1) > em // if (stepE.size() == 0) { stepE = doc.select( "body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix > div.cp_body_left > div.measure > div.edit > p"); } for (int i = 0; i < stepE.size(); i++) { Element e = stepE.get(i); if (e.children().hasClass("step")) { String step = e.text(); if (!"".equals(step)) { holder.steps.add(step); } } } // body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix // > div.cp_body_left > div.measure > div.editnew.edit > // div:nth-child(1) return holder; }
/** Parsing head-only and body-only tags without wrappers must still yield head and body. */
@Test
public void createsDocumentStructure() {
  Document doc =
      Jsoup.parse(
          "<meta name=keywords /><link rel=stylesheet /><title>jsoup</title><p>Hello world</p>");
  Element headElement = doc.getHead();
  Element bodyElement = doc.getBody();

  // overall shape: document -> (head, body)
  assertEquals(2, doc.children().size());
  assertEquals(3, headElement.children().size());
  assertEquals(1, bodyElement.children().size());

  // the meta tag ended up in the head, not in the body
  assertEquals("keywords", headElement.getElementsByTag("meta").get(0).attr("name"));
  assertEquals(0, bodyElement.getElementsByTag("meta").size());

  // text content
  assertEquals("jsoup", doc.getTitle());
  assertEquals("Hello world", bodyElement.text());
  assertEquals("Hello world", bodyElement.children().get(0).text());
}
// recursively processes the element to replace <br>'s with \n private void fixLineBreaks(Element el) { for (final Element e : el.children()) { if (e.tagName().equals("br")) { e.before("\n"); e.remove(); } else { fixLineBreaks(e); } } }
/** * Constructs a component hierarchy from the design specified as an html tree. * * <p>If a component root is given, the component instances created during reading the design are * assigned to its member fields based on their id, local id, and caption * * @param doc the html tree * @param componentRoot optional component root instance. The type must match the type of the root * element in the design. * @param classWithFields a class (componentRoot class or a super class) with some member fields. * The member fields whose type is assignable from {@link Component} are bound to fields in * the design based on id/local id/caption */ private static DesignContext designToComponentTree( Document doc, Component componentRoot, Class<?> classWithFields) { DesignContext designContext = new DesignContext(doc); designContext.readPackageMappings(doc); // No special handling for a document without a body element - should be // taken care of by jsoup. Element root = doc.body(); Elements children = root.children(); if (children.size() > 1) { throw new DesignException( "The first level of a component hierarchy should contain at most one root component, but found " + children.size() + "."); } Element element = children.size() == 0 ? 
null : children.first(); if (componentRoot != null) { if (element == null) { throw new DesignException( "The root element cannot be null when the specified root Component is" + " not null."); } // user has specified root instance that may have member fields that // should be bound final FieldBinder binder; try { binder = new FieldBinder(componentRoot, classWithFields); } catch (IntrospectionException e) { throw new DesignException("Could not bind fields of the root component", e); } // create listener for component creations that binds the created // components to the componentRoot instance fields ComponentCreationListener creationListener = new ComponentCreationListener() { @Override public void componentCreated(ComponentCreatedEvent event) { binder.bindField(event.getComponent(), event.getLocalId()); } }; designContext.addComponentCreationListener(creationListener); // create subtree designContext.readDesign(element, componentRoot); // make sure that all the member fields are bound Collection<String> unboundFields = binder.getUnboundFields(); if (!unboundFields.isEmpty()) { throw new DesignException("Found unbound fields from component root " + unboundFields); } // no need to listen anymore designContext.removeComponentCreationListener(creationListener); } else { // createChild creates the entire component hierarchy componentRoot = element == null ? null : designContext.readDesign(element); } designContext.setRootComponent(componentRoot); return designContext; }
/**
 * Looks for an {@code <h1>} whose first child element's text is a year marker of the form
 * {@code (dddd– )} (four digits, an en dash, a space, all in parentheses).
 *
 * <p>On the first match the four digits are stored via {@code setYear} and the heading text
 * minus as many trailing characters as the marker is long, trimmed, is stored via {@code
 * setTitle}.
 *
 * @return {@code true} if a matching heading was found and year/title were set
 */
private boolean hasValidHeader() {
  // compiled once per call instead of once per heading
  final Pattern yearMarker = Pattern.compile("[(](\\d){4}\u2013 [)]");

  for (Element heading : getDoc().getElementsByTag("h1")) {
    if (heading.children().isEmpty()) {
      continue;
    }
    Matcher matcher = yearMarker.matcher(heading.children().first().text());
    if (!matcher.matches()) {
      continue;
    }
    final String marker = matcher.group(0);
    final String headingText = heading.text();
    // characters 1..4 of "(dddd– )" are the year digits
    setYear(marker.substring(1, 5));
    // assumes the marker is the last thing in the heading text
    setTitle(headingText.substring(0, headingText.length() - marker.length()).trim());
    return true;
  }
  return false;
}
private static void parseReplyCount(Topic.Builder topicBuilder, Element ele) { final Elements children = ele.children(); final int count; if (children.size() > 0) { final String numStr = ele.child(0).text(); count = Integer.parseInt(numStr); } else { // do not have reply yet count = 0; } topicBuilder.setReplyCount(count); }
/**
 * Recursively walks {@code parentElement}, filing elements into {@code map} and {@code
 * topMenumap} under keys of the form {@code level + "," + element.hashCode()}.
 *
 * <p>The element's inner HTML is first replaced by the result of {@code deleteComent}. Then:
 *
 * <ul>
 *   <li>an element with children that is a {@code ul} or {@code table} is treated as a unit: if
 *       its space-stripped HTML contains "首页" (home page) or its id contains "nav", each of its
 *       links is put into {@code topMenumap} unless already present; otherwise the element
 *       itself is put into {@code map};
 *   <li>any other element with children is descended into, child by child;
 *   <li>a childless element that is not a {@code script} and has non-empty HTML is put into
 *       {@code map}.
 * </ul>
 *
 * @param parentElement the element to process
 * @param level depth used in the map keys; incremented locally before an entry is stored or a
 *     child is visited
 */
private void getChildElement(Element parentElement, Integer level) {
  // re-parse the inner HTML after passing it through deleteComent
  parentElement.html(deleteComent(parentElement.html()));
  if (parentElement.children().size() > 0) {
    level += 1;
    // NOTE(review): for ul/table the branch below does not depend on i, so the same work is
    // repeated once per child; the puts overwrite the same keys.
    for (int i = 0; i < parentElement.children().size(); i++) {
      if (("ul".equals(parentElement.tagName().toLowerCase()))
          || ("table".equals(parentElement.tagName().toLowerCase()))) {
        // ul/table is handled as a whole.
        // Strips spaces before the keyword test. NOTE(review): the two patterns look identical
        // here; the second was presumably a full-width space originally - confirm.
        String html = parentElement.html().replaceAll(" ", "").replaceAll(" ", "");
        if (html.contains("首页") || parentElement.id().contains("nav")) {
          // navigation block: collect every link once
          Elements links = parentElement.select("a");
          for (Element ele : links) {
            if (topMenumap.get(level + "," + ele.hashCode()) == null) {
              topMenumap.put(level + "," + ele.hashCode(), ele);
            }
          }
        } else {
          map.put(level + "," + parentElement.hashCode(), parentElement);
        }
      } else {
        getChildElement(parentElement.child(i), level);
      }
    }
  } else {
    // leaf element: scripts are ignored
    if ("script".equals(parentElement.tagName().toLowerCase())) {
      return;
    }
    if (StringUtils.isNotEmpty(parentElement.html())) {
      level += 1;
      map.put(level + "," + parentElement.hashCode(), parentElement);
    }
  }
}
public ForumUser getUserData() { ForumUser fUser = new ForumUser(); fUser.userDefinedInfo = new HashMap<String, String>(); Element temp = mainUserInfo.select("h1").get(0); System.out.println(temp.text()); if (!temp.text().equals("")) { fUser.userName = temp.text(); } fUser.avatarUri = "www.forum.hr/" + doc.getElementById("user_avatar").attr("src"); temp = mainUserInfo.select("div[id=last_online]").get(0); if (temp.text() != null) { fUser.userLastActivity = temp.text(); } int i = 0; for (Element el : userMiniStats.children()) { System.out.println(el); System.out.println("______________________________"); // check only evens for keys (on odds are values) if (i % 2 == 0) { temp = userMiniStats.children().get(i); if (!temp.text().equals("")) fUser.userDefinedInfo.put(temp.text(), userMiniStats.children().get(i + 1).text()); } i += 1; } System.out.println("USername: "******"Last activity: " + fUser.userLastActivity); System.out.println("Avatar URI: " + fUser.avatarUri); System.out.println("Other info: " + fUser.userDefinedInfo); return fUser; }
/**
 * Returns the child elements of the second child of the element with id {@code
 * OtherIndicesTable}, in document order.
 */
@Override
public LinkedList<Element> getAllIndexesRows() {
  Element table = getDocument().getElementById("OtherIndicesTable");
  // the second child of the table holds the rows
  Element tbody = table.child(1);
  return new LinkedList<Element>(tbody.children());
}
/** Tags that are never closed with {@code >} must still produce elements. */
@Test
public void parsesUnterminatedTag() {
  // a lone, unterminated tag
  Document loneTag = Jsoup.parse("<p");
  assertEquals(1, loneTag.getElementsByTag("p").size());

  // an unterminated tag running into a second unterminated tag
  Document nested = Jsoup.parse("<div id=1<p id='2'");
  Element div = nested.getElementById("1");
  assertEquals(1, div.children().size());
  assertNotNull(nested.getElementById("2"));
}
// Busca o Cep pelo logradouro. public List<String> getByAdress(String address) throws IOException { listAddress = new ArrayList<String>(); // mapeamento dos parametros que será passado na requisição Map<String, String> query = new HashMap<String, String>(); query.put("relaxation", address); query.put("TipoCep", "ALL"); query.put("semelhante", "N"); query.put("cfm", "1"); query.put("Metodo", "listaLogradouro"); query.put("TipoConsulta", "relaxation"); query.put("StartRow", "1"); query.put("EndRow", "10"); // Faz uma requisição no site do correios (www.buscacep.com.br) com Json, passando os parametros // mapeados, // requisição deverá ser do tipo post. // Armazena o retorno em uma variavel doc. Document doc = Jsoup.connect(Utils.adressCorreios) .timeout(20000) .data(query) .header("Origin", "http://www.buscacep.correios.com.br") .header("Referer", "http://www.buscacep.correios.com.br") .post(); // Acessa o retorno do doc e percorre o resultado buscando as informações de Cep de acordo com o // endereço passado. // Armazena os resultados na lista criada e retorna a mesma para que outras classes possam // acessar Elements elements = doc.select("table").eq(2); Elements rows = elements.select("tr"); Iterator<Element> rowIterator = rows.iterator(); while (rowIterator.hasNext()) { Address enderecos = new Address(); Element element = rowIterator.next(); Elements codigopostal = element.children().select("td").eq(4); enderecos.setCEP(codigopostal.text()); listAddress.add(enderecos.getCEP()); } return listAddress; }
/**
 * Builds a {@link Topic} from one list item of a node page: the first child element is passed
 * to {@code parseMember}, the third to {@code parseTitle} and {@code parseInfo}, and the fourth
 * to {@code parseReplyCount}.
 *
 * @param item the list item element
 * @param node the node the topic belongs to
 * @return the created topic
 */
private static Topic parseItemForNode(Element item, Node node) {
  final Elements cells = item.children();
  final Topic.Builder builder = new Topic.Builder();

  parseMember(builder, cells.get(0));

  final Element titleCell = cells.get(2);
  parseTitle(builder, titleCell);
  parseInfo(builder, titleCell, node);

  parseReplyCount(builder, cells.get(3));

  return builder.createTopic();
}
/** {@code child(int)} must throw for an index beyond the last child element. */
@Test
public void testChildThrowsIndexOutOfBoundsOnMissing() {
  Element div = Jsoup.parse("<div><p>One</p><p>Two</p></div>").select("div").first();

  assertEquals(2, div.children().size());
  assertEquals("One", div.child(0).text());

  try {
    div.child(3);
  } catch (IndexOutOfBoundsException expected) {
    // this is the behaviour under test
    return;
  }
  fail("Should throw index out of bounds");
}
public void download(Connection aInConnection, Collection<Image> images) throws IOException { aInConnection.url(url); Document lDocument = aInConnection.get(); Element lMain = lDocument.getElementById("main"); Elements lContents = lMain.getElementsByClass("content"); if (lContents.size() == 1) { StringBuilder sb = new StringBuilder(); Element lContent = lContents.first(); collectImages(lContent, images); Elements lLightboxElements = lContent.getElementsByClass("lightbox"); for (Element lLightboxElement : lLightboxElements) { Collection<Node> lImageNodes = extractImageNodes(lLightboxElement); Element lParent = lLightboxElement.parent(); int i = lLightboxElement.siblingIndex(); lParent.insertChildren(i, lImageNodes); lLightboxElement.remove(); } Elements lChildElements = lContent.children(); for (Element lChildElement : lChildElements) { if (lChildElement.hasClass("clear")) { // no more post content break; } if (title == null && lChildElement.tagName().equals("h1")) { // the first h1 header is the title title = lChildElement.html(); } else { if (excerpt == null && lChildElement.tagName().equals("p")) { excerpt = lChildElement.text(); } String lStr = lChildElement.toString(); sb.append(lStr); } } content = sb.toString(); Elements lDateElements = lContent.getElementsByClass("date"); String lHunDate = lDateElements.first().html(); date = new PostDate(lHunDate); } else { System.out.println("More than one content in main section of post page " + toString()); } }
/**
 * Renders {@code root} and its descendants as text: one {@code tagName:ownText} line per
 * element, prefixed by {@code indentation} spaces, with child elements indented one space
 * further. Each child's rendering is followed by an extra newline.
 *
 * @param root the element to render
 * @param indentation number of leading spaces for {@code root}
 * @return the rendered text
 */
public static String printNode(Element root, int indentation) {
  StringBuilder out = new StringBuilder();
  for (int remaining = indentation; remaining > 0; remaining--) {
    out.append(' ');
  }
  out.append(root.tagName()).append(":").append(root.ownText()).append("\n");

  for (Element child : root.children()) {
    out.append(printNode(child, indentation + 1)).append("\n");
  }
  return out.toString();
}
/**
 * Visits {@code el} and its descendants, recording every heading ({@code h1}-{@code h6}).
 *
 * <p>For a heading the section counter is incremented and the filtered heading text is stored
 * in {@code dataStore} under {@code SectionTitle/<title>/<hash>/<secIndex>}. Each child element
 * is walked by a new {@code Walker} seeded with the current counter, whose result becomes the
 * new counter.
 *
 * @param el the element to visit
 * @return the section counter after visiting {@code el} and all of its descendants
 */
public int walk(Element el) {
  Elements children = el.children();
  String tagName = el.tagName().toLowerCase();

  if (tagName.matches("h[1-6]")) {
    secIndex++;
    // plain concatenation replaces the deprecated new Integer(secIndex).toString()
    String key = "SectionTitle" + "/" + title + "/" + hash + "/" + secIndex;
    dataStore.put(key, secNameFilter(el.text()));
  }

  for (Element child : children) {
    secIndex = new Walker(secIndex).walk(child);
  }
  return secIndex;
}
private static void alterElement(Element e) { org.jsoup.select.Elements s = e.children(); Iterator<Element> ele = s.iterator(); int i = 0; while (ele.hasNext()) { Element r = ele.next(); if (!r.tag().getName().equals("p")) { r.tagName("p"); // plain replace // Element rtemp = r.clone(); // Element ep = new Element(Tag.valueOf("p"), ""); // ep.appendChild(rtemp); // r.replaceWith(ep); // StringBuffer bf = new StringBuffer(); // bf.append("<k>").append(r.toString()).append("</k>"); // r.html(bf.toString()); // System.out.println(r.tagName()); } i++; } }
@SuppressWarnings("unused") private void getHtml(String url, int groupPos) { Document doc; try { List<MissionItemData> missionsList = new ArrayList<>(); // need http protocol doc = Jsoup.connect(url).userAgent("Mozilla").get(); Elements elementsByClass = doc.getElementsByClass("asset-abstract"); int id = 0; for (int i = 0; i < elementsByClass.size(); i++) { Element el = elementsByClass.get(i); if (el.children().hasClass("asset-more")) { Element inCon = el.getElementsByClass("asset-content").first().select("a").first(); String name = inCon.text(); String hrefLink = inCon.attr("href"); Element imgEl = el.getElementsByClass("asset-abstract-imgLink").first().select("img").first(); String imgLink = imgEl.attr("src"); String content = el.html(); MissionItemData itemMission = new MissionItemData(id, name, hrefLink, imgLink, content); missionsList.add(itemMission); id = id + 1; } } listDataChild.put(listDataHeader.get(groupPos).getName(), missionsList); } catch (IOException e) { e.printStackTrace(); } }
@Override protected void onPostExecute(Document document) { super.onPostExecute(document); story_pull_list.onRefreshComplete(); File file = new File(Environment.getExternalStorageDirectory() + "/Latest_qbaobei.txt"); try { if (!file.exists()) { file.createNewFile(); } String str = document.toString(); FileWriter writer = new FileWriter(file.getAbsolutePath()); BufferedWriter bufferedWriter = new BufferedWriter(writer); bufferedWriter.write(str); bufferedWriter.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } if (document == null) { ToastAlone.show(R.string.load_fail_hint); return; } Element page = document.select("div.page").first(); if (page != null) { Elements children = page.children(); if (children.size() == 1 && "prev".equals(children.first().attr("class"))) { // 证明已经到了最后一页 isLastPage = true; ToastAlone.show(R.string.load_all_data); Log.i("cxm", "the last one"); return; } } progressbar.setVisibility(View.GONE); // 使用新的爬虫规则 Elements div_elements = document.select("div.news-list-ul"); if (div_elements != null) { Element div_fir = div_elements.first(); if (div_fir != null) { Elements div_children = div_fir.children(); if (div_children != null) { ArrayList<StoryBean> storyBeans = new ArrayList<StoryBean>(); for (Element element : div_children) { StoryBean bean = new StoryBean(); Element href_elem = element.select("a[href]").first(); String href_str = href_elem.attr("href"); Element img_elem = element.select("img[src]").first(); String img_str = Constans.defualt_pic; if (img_elem != null) { img_str = img_elem.attr("src"); } Element tit_element = element.select("p.tit").first(); String tit_str = tit_element.text(); LogUtil.v(tit_str + "---" + href_str + "---" + img_str); bean.setTitle(tit_str); bean.setImg(img_str); bean.setUrl(href_str); storyBeans.add(bean); } if (mLatestStart == 1) { mStoryAdapter.setData(storyBeans); } else { mStoryAdapter.addData(storyBeans); } } } } else { // 立马启动第二种解析方式 
Elements ul_elements = document.select("ul.index-ul"); if (ul_elements != null) { Element ul_fir = ul_elements.first(); Elements ul_children = ul_fir.children(); if (ul_children != null) { ArrayList<StoryBean> storyBeans = new ArrayList<StoryBean>(); for (Element child : ul_children) { StoryBean bean = new StoryBean(); Element href_elem = child.select("a[href]").first(); Element img_elem = child.select("img[src]").first(); String title = href_elem.text(); String content_url = href_elem.attr("href"); String img_url = Constans.defualt_pic; if (img_elem != null) { img_url = img_elem.attr("src"); } bean.setTitle(title); bean.setUrl(content_url); bean.setImg(img_url); storyBeans.add(bean); } if (mLatestStart == 1) { mStoryAdapter.setData(storyBeans); } else { mStoryAdapter.addData(storyBeans); } } } } /*Elements elements = document.select("[class]"); for (Element element : elements) { if (element == null) { Log.v("cxm", "null"); } else { String className = element.className(); if ("index-ul".equals(className)) { Elements elements1 = element.select("li"); Log.v("cxm", "size=" + elements1.size()); ArrayList<StoryBean> storyBeans = new ArrayList<StoryBean>(); for (Element child : elements1) { StoryBean bean = new StoryBean(); Element href = child.select("[href]").first(); String name = href.text(); Element img = child.select("img[src]").first(); Log.w("cxm", "href=" + href.attr("href") + " ~~ name=" + name + "" + " ~~ img="); bean.setTitle(name); bean.setmContentUrl(href.attr("href")); if (null == img) { Log.e("cxm", "img = null"); bean.setPicUrl(""); } else { bean.setPicUrl(img.attr("src")); } storyBeans.add(bean); } if (mLatestStart == 1) { mStoryAdapter.setData(storyBeans); } else { mStoryAdapter.addData(storyBeans); } } } }*/ }
private void parseNotes() { Integer currentVoice; int duration = 0; Integer position; Integer lastDuration; Elements parts = this.doc.select("part"); for (Element part : parts) { position = 0; lastDuration = 0; divisions = 1; for (Element thismeasure : part.getElementsByTag("measure")) { String measure = "0"; if (!thismeasure.getElementsByTag("divisions").isEmpty()) { divisions = Integer.valueOf(thismeasure.getElementsByTag("divisions").text()); } measure = thismeasure.attr("number"); for (Element thisnote : thismeasure.children()) { if (thisnote.tagName().equals("note")) { Note note = new Note(); if (!thisnote.getElementsByTag("voice").isEmpty()) { currentVoice = Integer.valueOf(thisnote.getElementsByTag("voice").text()); note.setVoice(currentVoice); } // get the pitch if (!thisnote.getElementsByTag("pitch").isEmpty()) { for (Element thispitch : thisnote.getElementsByTag("pitch")) { String step = thispitch.getElementsByTag("step").text(); int pitch = getPitchFromStep(step); String octave = thispitch .getElementsByTag("octave") .text() .replaceAll("^\\s+|\\s+$|\\s*(\n)\\s*|(\\s)\\s*", "$1$2"); int octaveInt = Integer.parseInt(octave); note.setOctave(octaveInt); String alter = String.valueOf(thispitch.getElementsByTag("alter").text()); int alterValue = 0; if (!thispitch.getElementsByTag("alter").isEmpty()) { if (alter.equals("1")) { alterValue = 1; } else if (alter.equals("-1")) { alterValue = -1; } else if (alter.equals("2")) { alterValue = 2; } else if (alter.equals("-2")) { alterValue = -2; } } int pitchClass = pitch + alterValue; note.setPitchClass(pitchClass); note.setPitch(pitchClass + 12 * octaveInt); } } else { note.setPitch(cp.model.note.Note.REST); } if (thisnote.getElementsByTag("time-modification").isEmpty()) { switch (thisnote.getElementsByTag("type").text()) { case "16th": duration = DurationConstants.SIXTEENTH; if (!thisnote.getElementsByTag("dot").isEmpty()) { // TODO } break; case "eighth": duration = DurationConstants.EIGHT; if 
(!thisnote.getElementsByTag("dot").isEmpty()) { duration = DurationConstants.EIGHT + DurationConstants.SIXTEENTH; } break; case "quarter": duration = DurationConstants.QUARTER; if (!thisnote.getElementsByTag("dot").isEmpty()) { duration = DurationConstants.QUARTER + DurationConstants.EIGHT; } break; case "half": duration = DurationConstants.HALF; if (!thisnote.getElementsByTag("dot").isEmpty()) { duration = DurationConstants.HALF + DurationConstants.QUARTER; } break; case "whole": duration = DurationConstants.WHOLE; if (!thisnote.getElementsByTag("dot").isEmpty()) { duration = DurationConstants.WHOLE + DurationConstants.HALF; } break; default: break; } } else { if (thisnote.getElementsByTag("actual-notes").text().equals("3")) { note.setTriplet(true); note.setTimeModification(thisnote.getElementsByTag("normal-type").text()); switch (thisnote.getElementsByTag("type").text()) { case "16th": duration = DurationConstants.SIXTEENTH_TRIPLET; break; case "eighth": duration = DurationConstants.EIGHT_TRIPLET; if (!thisnote.getElementsByTag("dot").isEmpty()) { duration = DurationConstants.EIGHT_TRIPLET + DurationConstants.SIXTEENTH_TRIPLET; } break; case "quarter": duration = DurationConstants.QUARTER_TRIPLET; if (!thisnote.getElementsByTag("dot").isEmpty()) { duration = DurationConstants.QUARTER_TRIPLET + DurationConstants.EIGHT_TRIPLET; } break; case "half": duration = DurationConstants.HALF_TRIPLET; if (!thisnote.getElementsByTag("dot").isEmpty()) { duration = DurationConstants.HALF_TRIPLET + DurationConstants.QUARTER_TRIPLET; } break; default: break; } } else if (thisnote.getElementsByTag("actual-notes").text().equals("6")) { note.setSextuplet(true); note.setTimeModification(thisnote.getElementsByTag("normal-type").text()); switch (thisnote.getElementsByTag("type").text()) { case "16th": duration = DurationConstants.SIXTEENTH_TRIPLET; break; case "eighth": duration = DurationConstants.EIGHT_TRIPLET; if (!thisnote.getElementsByTag("dot").isEmpty()) { duration = 
DurationConstants.EIGHT_TRIPLET + DurationConstants.SIXTEENTH_TRIPLET; } break; case "quarter": duration = DurationConstants.QUARTER_TRIPLET; if (!thisnote.getElementsByTag("dot").isEmpty()) { duration = DurationConstants.QUARTER_TRIPLET + DurationConstants.EIGHT_TRIPLET; } break; case "half": duration = DurationConstants.HALF_TRIPLET; if (!thisnote.getElementsByTag("dot").isEmpty()) { duration = DurationConstants.HALF_TRIPLET + DurationConstants.QUARTER_TRIPLET; } break; default: break; } } else if (thisnote.getElementsByTag("actual-notes").text().equals("5")) { note.setQuintuplet(true); note.setTimeModification(thisnote.getElementsByTag("normal-type").text()); switch (thisnote.getElementsByTag("type").text()) { case "16th": duration = DurationConstants.SIXTEENTH_QUINTUPLET; break; case "eighth": duration = DurationConstants.EIGHT_QUINTUPLET; if (!thisnote.getElementsByTag("dot").isEmpty()) { duration = DurationConstants.EIGHT_QUINTUPLET + DurationConstants.SIXTEENTH_QUINTUPLET; } break; default: break; } } } // duration = Integer.valueOf(thisnote.getElementsByTag("duration").text()); // * // // divMultiplier.get(divisions); // duration = duration * Note.DEFAULT_LENGTH / MusicXMLWriter.DIVISIONS; note.setLength(duration); note.setDisplayLength(duration); // now check if it is a chord if (!thisnote.getElementsByTag("chord").isEmpty()) { // note.setStartTime(position); // retract previous duration note.setPosition(position - lastDuration); } else { // increment start time of the current voice // System.out.print(" start: " + position); note.setPosition(position); position = position + duration; } lastDuration = duration; note.setInstrument(thisnote.getElementsByTag("instrument").attr("id")); if (!thisnote.getElementsByTag("tied").isEmpty()) { if (thisnote.getElementsByTag("tied").attr("type").equals("start")) { note.setTieStart(true); } if (thisnote.getElementsByTag("tied").attr("type").equals("stop")) { note.setTieEnd(true); } } if 
(!thisnote.getElementsByTag("tuplet").isEmpty()) { Element tuplet = thisnote.getElementsByTag("tuplet").first(); if (tuplet.attr("type").equals("start")) { note.setTupletType(TupletType.START); } if (tuplet.attr("type").equals("stop")) { note.setTupletType(TupletType.STOP); } if (tuplet.attr("bracket").equals("yes")) { note.setBracket(true); } } if (!thisnote.getElementsByTag("beam").isEmpty()) { if (thisnote.getElementsByTag("beam").size() == 1) { if ("begin".equals(thisnote.getElementsByTag("beam").text())) { note.setBeamType(BeamType.BEGIN); } else if ("continue".equals(thisnote.getElementsByTag("beam").text())) { note.setBeamType(BeamType.CONTINUE); } else if ("end".equals(thisnote.getElementsByTag("beam").text())) { note.setBeamType(BeamType.END); } } if (thisnote.getElementsByTag("beam").size() == 2) { Element firsBeam = thisnote.getElementsByTag("beam").get(0); Element secondBeam = thisnote.getElementsByTag("beam").get(1); if ("begin".equals(firsBeam.text()) && "begin".equals(secondBeam.text())) { note.setBeamType(BeamType.BEGIN_BEGIN); } else if ("continue".equals(firsBeam.text()) && "continue".equals(secondBeam.text())) { note.setBeamType(BeamType.CONTINUE_CONTINUE); } else if ("end".equals(firsBeam.text()) && "end".equals(secondBeam.text())) { note.setBeamType(BeamType.END_END); } else if ("continue".equals(firsBeam.text()) && "begin".equals(secondBeam.text())) { note.setBeamType(BeamType.CONTINUE_BEGIN); } else if ("continue".equals(secondBeam.text()) && "end".equals(secondBeam.text())) { note.setBeamType(BeamType.CONTINUE_END); } } } notes.add(note); } else if (thisnote.tagName().equals("forward")) { position = position + Integer.valueOf(thisnote.getElementsByTag("duration").text()); // * divMultiplier.get(divisions); } else if (thisnote.tagName().equals("backup")) { // System.out.println("BACKUP" + // Integer.valueOf(thisnote.getElementsByTag("duration").text()) // * divMultiplier.get(divisions)); position = position - 
Integer.valueOf(thisnote.getElementsByTag("duration").text()); // * // divMultiplier.get(divisions); } } } } }
private ArrayList<HashMap<String, ArrayList<String>>> fetchResult(String url) { ArrayList<HashMap<String, ArrayList<String>>> results = new ArrayList<HashMap<String, ArrayList<String>>>(); try { Document doc = Jsoup.connect(url).get(); Elements infos = doc.select("div.result"); Elements pagingLinks = doc.select("div.paging").select("a"); for (Element info : infos) { HashMap<String, ArrayList<String>> res = new HashMap<String, ArrayList<String>>(); ArrayList<String> names = new ArrayList<String>(); ArrayList<String> address = new ArrayList<String>(); ArrayList<String> phoneNrs = new ArrayList<String>(); ArrayList<String> titles = new ArrayList<String>(); Elements nm = info.select("span.cut"); if (nm.size() == 0) { // Businesses have have links not spans nm = info.select("a.cut"); } for (Element name : nm) { names.add(name.text()); if (name.children().size() > 0) { Element theTitle = name.child(0); if (theTitle != null) { titles.add(theTitle.text()); } } } Elements adrs = info.select("a.addressinfo"); for (Element adr : adrs) { address.add(adr.text()); } Elements pNrs = info.select("a.phone"); for (Element phoneNr : pNrs) { phoneNrs.add(phoneNr.text()); } res.put(Keys.KEY_NAMES, names); res.put(Keys.KEY_ADDRESSES, address); res.put(Keys.KEY_PHONE_NUMBERS, phoneNrs); res.put(Keys.KEY_TITLES, titles); results.add(res); } if (!pagingLinks.isEmpty()) { Element lastUrl = pagingLinks.last(); String linkText = lastUrl.text().replaceAll("\\s", "").toLowerCase(); if (linkText.equals(Keys.KEY_MORE_RESULTS)) { mNextUrl = BASE_URL + lastUrl.attr("href").replace(" ", "+"); } else { mNextUrl = null; } } else { mNextUrl = null; } } catch (Exception e) { e.printStackTrace(); } finally { return results; } }
public void extractReferences(Document htmlDoc) { Elements references = htmlDoc.select(ContentXPath.REFERENCE.path); Element firstAuthorSNM, firstAuthorFNM, authors, citationAuthorsList, source, volume, fpage, lpage, date, citeComplete, medline, titleElement, publisherName, publisherLocation, referenceUnstructured; Elements citationAuthorsEntries, authorElements; Document referenceHtml, authorsHtml; Reference refInfo; String authorsList, firstPage, lastPage, completePages, title, completeCitation, citeNodeText, medlineLink, authorClass, firstName, lastName; if (references != null) { for (Element reference : references) { refInfo = new Reference(); // doi refInfo.setDoi(reference.attr(ContentXPath.REFERENCE_DOI_ATTR.path)); // parse content of reference referenceHtml = HtmlDocumentUtil.getHtmlDocumentFromString(reference.html()); // check if reference is unstructured referenceUnstructured = referenceHtml.select(ContentXPath.REFERENCE_UNSTRUCTURED.path).first(); if (referenceUnstructured != null) { // TODO: handle unstructured Data } else { // authors authorsList = ""; citationAuthorsList = referenceHtml.select(ContentXPath.AUTHORS2_ROOT.path).first(); if (citationAuthorsList != null) { // citation version 2 authorsHtml = HtmlDocumentUtil.getHtmlDocumentFromString(citationAuthorsList.html()); citationAuthorsEntries = authorsHtml.select(ContentXPath.AUTHORS2_ENTRY.path); for (Element authorEntry : citationAuthorsEntries) { authorElements = authorEntry.children(); lastName = ""; firstName = ""; for (Element authorElement : authorElements) { authorClass = authorElement.attr("class"); if (authorClass.contains(ContentXPath.AUTHORS2_SURNAME.path)) lastName = authorElement.text(); else if (authorClass.contains(ContentXPath.AUTHORS2_FIRSTNAME.path)) firstName = authorElement.text(); } refInfo.addAuthor(refInfo.new Author(lastName, firstName)); } } else { // citation version 1 firstAuthorSNM = referenceHtml.select(ContentXPath.FIRST_AUTHOR_SURNAME.path).first(); 
firstAuthorFNM = referenceHtml.select(ContentXPath.FIRST_AUTHOR_FIRSTNAME.path).first(); refInfo.addAuthor(refInfo.new Author(firstAuthorSNM.text(), firstAuthorFNM.text())); authors = referenceHtml.select(ContentXPath.AUTHORS.path).first(); authorsList = ""; if (authors != null) { authorsList = authors.text(); addAuthorsToReference(authorsList, refInfo); } } // reference source/journal source = referenceHtml.select(ContentXPath.CITE_SOURCE.path).first(); if (source != null) { refInfo.setSource(source.text()); } else { source = referenceHtml.select(ContentXPath.CITE_SOURCE_JNL.path).first(); if (source != null) { refInfo.setSource(source.text()); } } // reference volume volume = referenceHtml.select(ContentXPath.CITE_VOLUME.path).first(); if (volume != null) refInfo.setVolume(volume.text()); // reference date date = referenceHtml.select(ContentXPath.CITE_DATE.path).first(); if (date != null) refInfo.setDate(date.text()); // complete citation citeComplete = referenceHtml.select(ContentXPath.CITE_COMPLETE.path).first(); completeCitation = citeComplete.text(); refInfo.setCompleteCitation(completeCitation); // reference title titleElement = referenceHtml.select(ContentXPath.CITE_TITLE.path).first(); if (titleElement != null) { title = titleElement.text(); refInfo.setTitle(title); } else { citeNodeText = citeComplete.ownText(); title = extractTitleFromCitation(citeNodeText); refInfo.setTitle(title); } // reference first page fpage = referenceHtml.select(ContentXPath.CITE_FPAGE.path).first(); firstPage = ""; if (fpage != null) firstPage = fpage.text(); // reference last page lpage = referenceHtml.select(ContentXPath.CITE_LPAGE.path).first(); if (lpage != null) { lastPage = lpage.text(); completePages = firstPage + "-" + lastPage; refInfo.setPages(completePages); } else if (fpage != null) { completePages = extractCompletePagesFromCitation(completeCitation, firstPage); refInfo.setPages(completePages); } // pmid from medline link (if available) medline = 
referenceHtml.select(ContentXPath.MEDLINE_LINK.path).first(); if (medline != null) { medlineLink = medline.attr(ContentXPath.MEDLINE_LINK_ATTR.path); refInfo.setPmid(extractPMIDFromMedlineLink(medlineLink)); } // publisherName publisherName = referenceHtml.select(ContentXPath.PUBL_NAME.path).first(); if (publisherName != null) refInfo.setPublisherName(publisherName.text()); // publisherLocation publisherLocation = referenceHtml.select(ContentXPath.PUBL_LOC.path).first(); if (publisherLocation != null) refInfo.setPublisherLocation(publisherLocation.text()); } // System.out.print("doi: " + refInfo.getDoi() + "; authors: "); // for(Author author: refInfo.getAuthors()){ // System.out.print(author.getLastName() + ", " + author.getFirstName() + "; "); // } // System.out.print("source: " + refInfo.getSource() + "; volume: " + refInfo.getVolume() // + "; date: " + refInfo.getDate() + "; pages: " + refInfo.getPages() + "; title: " + // title); // System.out.println(); // System.out.println("pages: " + refInfo.getPages()); // System.out.println("pmid: " + refInfo.getPmid()); // System.out.println("citeNode: " + citeNodeText); // System.out.println("title: " + refInfo.getTitle()); // System.out.println("publisher: " + refInfo.getPublisherName() + "; loc: " + // refInfo.getPublisherLocation()); } } }
static void parseCopies(DetailledItem res, Document doc, JSONObject data) throws JSONException { if ("doublestacked".equals(data.optString("copystyle"))) { // e.g. http://vopac.nlg.gr/Record/393668/Holdings#tabnav // for Athens_GreekNationalLibrary Element container = doc.select(".tab-container").first(); String branch = ""; for (Element child : container.children()) { if (child.tagName().equals("h5")) { branch = child.text(); } else if (child.tagName().equals("table")) { int i = 0; String callNumber = ""; for (Element row : child.select("tr")) { if (i == 0) { callNumber = row.child(1).text(); } else { Copy copy = new Copy(); copy.setBranch(branch); copy.setShelfmark(callNumber); copy.setBarcode(row.child(0).text()); copy.setStatus(row.child(1).text()); res.addCopy(copy); } i++; } } } } else if ("stackedtable".equals(data.optString("copystyle"))) { // e.g. http://search.lib.auth.gr/Record/376356 // or https://katalog.ub.uni-leipzig.de/Record/0000196115 // or https://www.stadt-muenster.de/opac2/Record/0367968 Element container = doc.select(".recordsubcontent, .tab-container").first(); // .tab-container is used in Muenster. 
String branch = ""; JSONObject copytable = data.getJSONObject("copytable"); for (Element child : container.children()) { if (child.tagName().equals("div")) { child = child.child(0); } if (child.tagName().equals("h3")) { branch = child.text(); } else if (child.tagName().equals("table")) { if (child.select("caption").size() > 0) { // Leipzig_Uni branch = child.select("caption").first().ownText(); } int i = 0; String callNumber = null; if ("headrow".equals(copytable.optString("signature"))) { callNumber = child.select("tr").get(0).child(1).text(); } for (Element row : child.select("tr")) { if (i < copytable.optInt("_offset", 0)) { i++; continue; } Copy copy = new Copy(); if (callNumber != null) { copy.setShelfmark(callNumber); } copy.setBranch(branch); Iterator<?> keys = copytable.keys(); while (keys.hasNext()) { String key = (String) keys.next(); if (key.startsWith("_")) continue; if (copytable.optString(key, "").contains("/")) { // Leipzig_Uni String[] splitted = copytable.getString(key).split("/"); int col = Integer.parseInt(splitted[0]); int line = Integer.parseInt(splitted[1]); int j = 0; for (Node node : row.child(col).childNodes()) { if (node instanceof Element) { if (((Element) node).tagName().equals("br")) { j++; } else if (j == line) { copy.set(key, ((Element) node).text()); } } else if (node instanceof TextNode && j == line && !((TextNode) node).text().trim().equals("")) { copy.set(key, ((TextNode) node).text()); } } } else { // Thessaloniki_University if (copytable.optInt(key, -1) == -1) continue; String value = row.child(copytable.getInt(key)).text(); copy.set(key, value); } } res.addCopy(copy); i++; } } } } }