public static void main(String[] args) throws Exception { try { HtmlCleaner cleaner = new HtmlCleaner(); nameList = new ArrayList<String>(); URL url = new URL( "http://apps.wandoujia.com/apps/com.eg.android.AlipayGphone/versions?pos=w/popup"); TagNode node = cleaner.clean(url); Object[] tags = node.evaluateXPath("/body/div//div[@class='version-block']/div[position()<4]"); int i = 1; for (Object tag : tags) { // System.out.println(((TagNode)tagSize).getText()+""); Object[] tagVersion = node.evaluateXPath( "/body/div//div[@class='version-block']/div[" + i + "]//i[@itemprop='softwareVersion']"); String app_verison = ((TagNode) tagVersion[0]).getText() + ""; System.out.println(((TagNode) tagVersion[0]).getText() + ""); Object[] tagVersionCode = node.evaluateXPath( "/body/div//div[@class='version-block']/div[" + i + "]//span[@class='version-code']"); String app_versioncode = ((TagNode) tagVersionCode[0]).getText() + ""; System.out.println(((TagNode) tagVersionCode[0]).getText() + ""); Object[] tagFileSize = node.evaluateXPath( "/body/div//div[@class='version-block']/div[" + i + "]//span[@class='apk-size']"); String app_size = ((TagNode) tagFileSize[0]).getText() + ""; System.out.println(((TagNode) tagFileSize[0]).getText() + ""); Object[] tagDownload = node.evaluateXPath( "/body/div//div[@class='version-block']/div[" + i + "]//a[@download]"); System.out.println(((TagNode) tagDownload[0]).getAttributeByName("href")); String app_url = ((TagNode) tagDownload[0]).getAttributeByName("href"); String app_name = ((TagNode) tagDownload[0]).getAttributeByName("download"); i++; // ***写入数据库 明天写*** } } catch (Exception exception) { exception.printStackTrace(); } }
private void processSelectSource(TagNode formNode, FormFlow formFlow) throws XPatherException, ResourceLoaderException { Object[] dynamicSelectNodes = formNode.evaluateXPath("//select[@" + Constants.SELECT_SOURCE_ATTR + "]"); for (Object dynamicSelectNodeO : dynamicSelectNodes) { TagNode dynamicSelectNode = (TagNode) dynamicSelectNodeO; String name = dynamicSelectNode.getAttributeByName(Constants.NAME_ATTR); String source = dynamicSelectNode.getAttributeByName(Constants.SELECT_SOURCE_ATTR); source = formFlow.resolveResourcePathIfRelative(source); String preselectFirstOption = dynamicSelectNode.getAttributeByName(Constants.SELECT_PRESELECT_FIRST_OPTION_ATTR); dynamicSelectNode.removeAttribute(Constants.SELECT_SOURCE_ATTR); dynamicSelectNode.removeAttribute(Constants.SELECT_PRESELECT_FIRST_OPTION_ATTR); logger.debug("Found dynamicSelectNode name:{}, source:{}", name, source); List<SelectOptionPojo> options = selectOptionHelper.loadOptions(source); if (!"true".equals(preselectFirstOption)) { options.add(0, new SelectOptionPojo("-- Please Select --", "")); } for (SelectOptionPojo selectOptionPojo : options) { TagNode optionNode = new TagNode("option"); String value = selectOptionPojo.getValue(); if (value != null) { optionNode.setAttribute("value", value); } optionNode.addChild(new ContentNode(selectOptionPojo.getText())); dynamicSelectNode.addChild(optionNode); } } // TODO: validate that submitted value comes from the list }
/**
 * Evaluates {@code xpathStr} against the cleaned form of {@code text} and returns one string per
 * match: the inner HTML for element matches, {@code toString()} for anything else.
 *
 * @param text HTML to clean and query
 * @return the matches (possibly empty), or {@code null} when cleaning yields no root node; if
 *     the XPath evaluation throws, the stack trace is printed and whatever was collected so far
 *     is returned
 */
@Override
public List<String> selectList(String text) {
  HtmlCleaner cleaner = new HtmlCleaner();
  TagNode root = cleaner.clean(text);
  if (root == null) {
    return null;
  }

  List<String> matches = new ArrayList<String>();
  try {
    Object[] found = root.evaluateXPath(xpathStr);
    if (found == null) {
      return matches;
    }
    for (Object item : found) {
      String rendered =
          item instanceof TagNode ? cleaner.getInnerHtml((TagNode) item) : item.toString();
      matches.add(rendered);
    }
  } catch (XPatherException e) {
    e.printStackTrace();
  }
  return matches;
}
protected void getFileAttache( SimpleHtmlSerializer htmlSerializer, TagNode pNode, KnouNoticeInfo knouNoticeInfo) { String expressionContent = "//div[@class=\"MultiFile-list\"]"; Object[] myNodeBody = null; try { myNodeBody = pNode.evaluateXPath(expressionContent); } catch (XPatherException e) { // TODO Auto-generated catch block e.printStackTrace(); } if (myNodeBody.length <= 0) { return; } TagNode tmpNode = (TagNode) myNodeBody[0]; TagNode[] nl = tmpNode.getChildTags(); // Log.d("HAN", "nl.length:" + nl.length); KnouNoticeFileInfo attacheFileInfo = null; for (int i = 0; i < nl.length; i++) { // 0번은 필요없는거 지움 // Log.d("HAN", "nl[i].getName():" + nl[i].getName()); // Log.d("HAN", "nl[i].getText():" + nl[i].getText()); attacheFileInfo = new KnouNoticeFileInfo(); if (nl[i].getName().trim().equals("a")) { String href = nl[i].getAttributeByName("href"); attacheFileInfo.href = "http://ep.knou.ac.kr" + href; attacheFileInfo.fileName = nl[i].getText().toString(); knouNoticeInfo.AttacheFile.add(attacheFileInfo); } } }
/**
 * Cleans {@code html}, evaluates {@code xpath} against it and returns the text of the first
 * match.
 *
 * @param html markup to parse
 * @param xpath XPath expression to evaluate
 * @return text content of the first matched node
 * @throws Exception if the XPath evaluation fails; indexing the first element also fails with
 *     an array-index exception when nothing matches, and the cast fails when the first match is
 *     not a {@code TagNode}
 */
public String GetInnerTextByXpath(String html, String xpath) throws Exception {
  TagNode root = new HtmlCleaner().clean(html);
  Object firstMatch = root.evaluateXPath(xpath)[0];
  return ((TagNode) firstMatch).getText().toString();
}
public static ArrayList<String> dobisliko(TagNode node, String XPathExpression) { TagNode description_node = null; ArrayList<String> Temp = new ArrayList<String>(); NodeList nodes; try { // description_node = (TagNode) node.evaluateXPath(XPathExpression)[0]; for (int x = 0; x < node.evaluateXPath(XPathExpression).length; x++) { description_node = (TagNode) node.evaluateXPath(XPathExpression)[x]; // // System.out.println("http://www.krka.si"+description_node.getAttributeByName("src")+"\n"+"---------------------------------------"); Temp.add("http://www.krka.si" + description_node.getAttributeByName("src").toString()); } } catch (XPatherException e) { e.printStackTrace(); } return Temp; // // System.out.println(description_node.getText()+"\n"+"---------------------------------------"); }
public static ArrayList<String> dobi_opis(TagNode node, String XPathExpression) { ArrayList<String> Temp = new ArrayList<String>(); TagNode description_node = null; NodeList nodes; try { // description_node = (TagNode) node.evaluateXPath(XPathExpression)[0]; for (int x = 0; x < node.evaluateXPath(XPathExpression).length; x++) { description_node = (TagNode) node.evaluateXPath(XPathExpression)[x]; // // System.out.println(description_node.getText()+"\n"+"---------------------------------------"); Temp.add(description_node.getText().toString()); } } catch (XPatherException e) { e.printStackTrace(); } return Temp; // // System.out.println(description_node.getText()+"\n"+"---------------------------------------"); }
public static void main(String[] args) throws Exception { try { HtmlCleaner cleaner = new HtmlCleaner(); // cleaner.clean(new File("s")); URL url = new URL("http://www.baidu.com"); TagNode node = cleaner.clean(url, "utf-8"); node.Object[] tagNodes = node.evaluateXPath("//p[@id='nv']/a"); for (Object tagNode : tagNodes) { System.out.println(((TagNode) tagNode).getText()); System.out.println(((TagNode) tagNode).getAttributeByName("href")); } } catch (Exception exception) { exception.printStackTrace(); } }
/**
 * Replaces the source of every {@code <input>} carrying {@code Constants.INPUT_SOURCE_ATTR}
 * with a proxy path.
 *
 * <p>For each such input a {@code FieldSourceProxy} is created from the current path, the field
 * name and the original source, registered on {@code formFlow}, and the element's
 * {@code rf.source} attribute is set to {@code rhinoforms/proxy/<proxyPath>}.
 *
 * @param formNode form element to scan
 * @param currentPath path handed to the proxy factory
 * @param formFlow flow that receives the created proxies
 * @throws XPatherException if the XPath query fails
 */
private void processInputSourceFields(TagNode formNode, String currentPath, FormFlow formFlow)
    throws XPatherException {
  // Select by the same attribute that is read and removed below. The query previously used
  // Constants.SELECT_SOURCE_ATTR, which only works if both constants share one value.
  // NOTE(review): confirm the two constants against the Constants class.
  Object[] autoCompleteNodes =
      formNode.evaluateXPath("//input[@" + Constants.INPUT_SOURCE_ATTR + "]");
  for (Object autoCompleteNodeO : autoCompleteNodes) {
    TagNode autoCompleteNode = (TagNode) autoCompleteNodeO;
    String fieldName = autoCompleteNode.getAttributeByName(Constants.NAME_ATTR);
    String source = autoCompleteNode.getAttributeByName(Constants.INPUT_SOURCE_ATTR);

    FieldSourceProxy fieldSourceProxy =
        proxyFactory.createFlowProxy(currentPath, fieldName, source);
    formFlow.addFieldSourceProxy(fieldSourceProxy);

    autoCompleteNode.removeAttribute(Constants.INPUT_SOURCE_ATTR);
    autoCompleteNode.setAttribute(
        "rf.source", "rhinoforms/proxy/" + fieldSourceProxy.getProxyPath());
  }
}
public void parseScoreSheet(String id) throws XPatherException, ParseException { ScoreSheetEntity scoreSheet = em.find(ScoreSheetEntity.class, id); TagNode html = cleaner.clean(scoreSheet.getContent()); // Race ------------------------------------------ String name = ((TagNode) html.evaluateXPath("//body//h1")[0]).getText().toString(); RaceEntity r = new RaceEntity(); r.setName(name); raceEntity = (RaceEntity) checkPossibleMatches(r, RaceEntity.class); // RaceVolume ------------------------------------------ RaceVolumeEntity rv = new RaceVolumeEntity(); String dateStr = ((TagNode) html.evaluateXPath("//body//div[@class='date']")[0]).getText().toString(); Date d = new SimpleDateFormat("dd. MM. yyyy").parse(dateStr); rv.setDate(d); rv.setRace(raceEntity); String vol = ((TagNode) html.evaluateXPath("//body//div[@class='volume']")[0]).getText().toString(); vol = vol.substring(0, vol.indexOf(".")); rv.setVolume(Integer.valueOf(vol)); raceVolume = (RaceVolumeEntity) checkPossibleMatches(rv, RaceVolumeEntity.class); man = true; TagNode menDiv = ((TagNode) html.evaluateXPath("//body//div[@id='men']")[0]); Object[] cats = menDiv.evaluateXPath("//table"); for (int i = 0; i < cats.length; i++) { TagNode c = (TagNode) cats[i]; processCategory(c); } man = false; TagNode womenDiv = ((TagNode) html.evaluateXPath("//body//div[@id='women']")[0]); cats = womenDiv.evaluateXPath("//table"); for (int i = 0; i < cats.length; i++) { TagNode c = (TagNode) cats[i]; processCategory(c); } }
/**
 * Evaluates {@code xpathStr} against the cleaned form of {@code text} and returns the first
 * match: its inner HTML when it is an element, otherwise its {@code toString()}.
 *
 * @param text HTML to clean and query
 * @return the first match, or {@code null} when cleaning yields no root node, when nothing
 *     matches, or when the XPath evaluation throws (the stack trace is printed)
 */
@Override
public String select(String text) {
  HtmlCleaner cleaner = new HtmlCleaner();
  TagNode root = cleaner.clean(text);
  if (root == null) {
    return null;
  }

  try {
    Object[] found = root.evaluateXPath(xpathStr);
    if (found == null || found.length == 0) {
      return null;
    }
    Object first = found[0];
    if (first instanceof TagNode) {
      return cleaner.getInnerHtml((TagNode) first);
    }
    return first.toString();
  } catch (XPatherException e) {
    e.printStackTrace();
  }
  return null;
}
/**
 * Scrapes the data rows for the given year into a list of {@code InstitutionDataItem}s.
 *
 * <p>The column layout is chosen by year (2008, 2010, 2016 or 2017 attribute set). For each row
 * the year is stored under {@code "year"}, then every column whose attribute name is not
 * {@code IGNORE} is stored under that name with surrounding whitespace and commas removed.
 *
 * @param year year substituted into {@code URL_PATTERN}
 * @return one item per row matched by {@code DATA_ROW_XPATH}
 * @throws MalformedURLException declared; presumably raised while fetching the page
 * @throws IOException declared; presumably raised while fetching the page
 * @throws XPatherException if the row query fails
 */
public List<InstitutionDataItem> getData(int year)
    throws MalformedURLException, IOException, XPatherException {
  String url = String.format(URL_PATTERN, year);
  System.out.println("reading from " + url);

  ArrayList<InstitutionDataItem> data = new ArrayList<InstitutionDataItem>();
  TagNode cleaned = ScraperUtils.getCleanedHtml(url);

  // The newest layout whose start year has been reached wins.
  String[] columns;
  if (year >= 2017) {
    columns = attributes2017;
  } else if (year >= 2016) {
    columns = attributes2016;
  } else if (year >= 2010) {
    columns = attributes2010;
  } else {
    columns = attributes2008;
  }

  for (Object row : cleaned.evaluateXPath(DATA_ROW_XPATH)) {
    TagNode tr = (TagNode) row;
    InstitutionDataItem dataItem = new InstitutionDataItem();
    dataItem.data.put("year", String.valueOf(year));
    for (int col = 0; col < columns.length; col++) {
      if (columns[col].equals(IGNORE)) {
        continue;
      }
      String cellText = tr.getChildTags()[col].getText().toString().trim().replace(",", "");
      dataItem.data.put(columns[col], cellText);
    }
    data.add(dataItem);
  }
  return data;
}
public static void getSongs( ArchiveShowObj show, ArrayList<ArchiveSongObj> songs, StaticDataStore db, boolean processSongs) { HtmlCleaner pageParser = new HtmlCleaner(); CleanerProperties props = pageParser.getProperties(); props.setAllowHtmlInsideAttributes(true); props.setAllowMultiWordAttributes(true); props.setRecognizeUnicodeChars(true); props.setOmitComments(true); ArrayList<String> songLinks = new ArrayList<String>(); ArrayList<String> songTitles = new ArrayList<String>(); String showTitle = show.getArtistAndTitle(); String showIdent = show.getIdentifier(); // XPATH says "Select out of all 'table' elements with attribute 'class' // equal to 'fileFormats' which contain element 'tr'..." // String songXPath = "//table[@class='fileFormats']//tr"; // XPATH says "Select out of all 'script' elements with attribute 'type' // equal to 'text/javascript'..." String m3uXPath = "//script"; String titlePath = "//head//title"; if (db.getShowExists(show) && processSongs) { songs.addAll(db.getSongsFromShow(show.getIdentifier())); show.setFullTitle(db.getShow(show.getIdentifier()).getArtistAndTitle()); return; } try { HttpParams params = new BasicHttpParams(); int timeout = (int) (15 * DateUtils.SECOND_IN_MILLIS); HttpConnectionParams.setConnectionTimeout(params, timeout); HttpConnectionParams.setSoTimeout(params, timeout); HttpClient client = new DefaultHttpClient(params); HttpGet page = new HttpGet(show.getShowURL().toString()); HttpResponse pageResponse = client.execute(page); StatusLine pageStatus = pageResponse.getStatusLine(); if (pageStatus.getStatusCode() == HttpStatus.SC_OK) { ResponseHandler<String> pageResponseHandler = new BasicResponseHandler(); TagNode node = pageParser.clean(pageResponseHandler.handleResponse(pageResponse)); String queryString = show.getLinkPrefix(); if (db.getPref("downloadFormat").equalsIgnoreCase("LBR")) { if (show.hasLBR()) { queryString += "_64kb.m3u"; } else if (show.hasVBR()) { queryString += "_vbr.m3u"; } } else { if (show.hasVBR()) { 
queryString += "_vbr.m3u"; } else if (show.hasLBR()) { queryString += "_64kb.m3u"; } } HttpGet M3Urequest = new HttpGet(queryString); HttpResponse M3Uresponse = client.execute(M3Urequest); StatusLine M3Ustatus = M3Uresponse.getStatusLine(); if (M3Ustatus.getStatusCode() == HttpStatus.SC_OK) { ResponseHandler<String> M3UresponseHandler = new BasicResponseHandler(); String m3uString = M3UresponseHandler.handleResponse(M3Uresponse); client.getConnectionManager().shutdown(); // Now split the .M3U file based on newlines. This will give // us the download links, which we store.. String m3uLinks[] = m3uString.split("\n"); for (String link : m3uLinks) { songLinks.add(link); } // Now use an XPATH evaluation to find all of the javascript scripts on the page. // If one of them can be split by "IAD.mrss = ", it should have the track names // in it. The second half of the split is valid javascript and can be interpreted, // therefore, as JSON. Pull the song titles out of that, and together with the // download links make ArchiveSongObjs and add them to the list of songs. 
Object[] titleNodes = node.evaluateXPath(m3uXPath); for (Object titleNode : titleNodes) { // List x = ((TagNode) titleNode).getChildren(); String songTitle = ""; for (Object y : x) { if (y instanceof ContentNode) { songTitle = ((ContentNode) y).toString(); songTitle = songTitle.trim(); if (songTitle.startsWith("Play(")) { String[] titles = songTitle.split("\\{\"title\""); for (int i = 1; i < titles.length; i++) { try { String title = titles[i].substring( nthIndexOf(titles[i], '"', 1), nthIndexOf(titles[i], '"', 2)); songTitles.add(title.substring(title.indexOf('.') + 2)); } catch (StringIndexOutOfBoundsException e) { } } } } } } if (show.getShowTitle().length() < 2) { String s = ((TagNode) node.evaluateXPath(titlePath)[0]) .getChildren() .toString() .replaceFirst(Pattern.quote("["), ""); show.setFullTitle(s.substring(0, s.lastIndexOf(": Free") - 1)); showTitle = show.getArtistAndTitle(); db.updateShow(show); } if (processSongs) { if (songLinks.size() == 0) { } else { // Do things for successful show parse db.insertShow(show); } // If we have the same amount of song titles as song links, // we should be all set. if (songTitles.size() == songLinks.size()) { for (int i = 0; i < songTitles.size(); i++) { String songLink = songLinks.get(i); String songTitle = songTitles.get(i); // If the show has a "selectedSong" // meaning that it was opened by // the user clicking on a song link, do // a comparison to see // if the song being added is the // selected song. If it is, set // selectedPos to the right index so // that the song can be played // once the ListView is filled. This is // inefficient, though it probably doesn't make a difference, // but we might consider making this a bit more efficient/elegant in the future. // FIXME. 
// if (show.hasSelectedSong()) { // if (songLink.equals(show.getSelectedSong())) { // selectedPos = i; // } // } else { // selectedPos = -1; // } ArchiveSongObj song = new ArchiveSongObj(songTitle, songLink, showTitle, showIdent); song.setID(db.insertSong(song)); songs.add(song); } db.setShowExists(show); db.insertRecentShow(show); } else { } } } else { client.getConnectionManager().shutdown(); } } else { client.getConnectionManager().shutdown(); } } catch (XPatherException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } // TODO Auto-generated method stub }
public static Boolean updateArtists(StaticDataStore db) { ArrayList<ArrayList<String>> artists = new ArrayList<ArrayList<String>>(); int numArtists; HtmlCleaner pageParser = new HtmlCleaner(); CleanerProperties props = pageParser.getProperties(); props.setAllowHtmlInsideAttributes(true); props.setAllowMultiWordAttributes(true); props.setRecognizeUnicodeChars(true); props.setOmitComments(true); try { String url = "http://www.archive.org/browse.php?field=/metadata/bandWithMP3s&collection=etree"; HttpParams params = new BasicHttpParams(); int timeout = (int) (15 * DateUtils.SECOND_IN_MILLIS); HttpConnectionParams.setConnectionTimeout(params, timeout); HttpConnectionParams.setSoTimeout(params, timeout); HttpClient client = new DefaultHttpClient(params); HttpGet request = new HttpGet(url); HttpResponse response = client.execute(request); StatusLine status = response.getStatusLine(); if (status.getStatusCode() == HttpStatus.SC_OK) { ResponseHandler<String> responseHandler = new BasicResponseHandler(); TagNode node = pageParser.clean(responseHandler.handleResponse(response)); client.getConnectionManager().shutdown(); // XPATH to get the nodes that we Want. Object[] artistsNodes = node.evaluateXPath("//tr[@valign='top']//li"); numArtists = artistsNodes.length; for (int i = 0; i < numArtists; i++) { // Cast the artistNode as a TagNode. TagNode artist = ((TagNode) artistsNodes[i]); // Grab the first child node, which is the link to the artist's page. // The inner HTML of this node will be the title. TagNode artistTitleSubNode = artist.getChildTags()[0]; // Remove the child node, so that the inner HTML of the artistNode // only contains the number of shows that the artist has. 
artist.removeChild(artistTitleSubNode); String artistTitle = pageParser.getInnerHtml(artistTitleSubNode); if (artistTitle != null) { ArrayList<String> artistPair = new ArrayList<String>(); artistPair.add( artistTitle .replace("'", "'") .replace(">", ">") .replace("<", "<") .replace(""", "\"") .replace("&", "&")); artistPair.add(pageParser.getInnerHtml(artist).trim()); /* * VibeVault.db.addArtist(artistTitle, pageParser * .getInnerHtml(artist).trim()); */ artists.add(artistPair); } } if (artists.size() > 0) { db.insertArtistBulk(artists); String s = DateFormat.format("yyyy-MM-dd", new GregorianCalendar().getTime()).toString(); db.updatePref("artistUpdate", s); } else { } } else { client.getConnectionManager().shutdown(); } } catch (Exception e) { e.printStackTrace(); } return true; }
private void processCategory(TagNode table) throws XPatherException, ParseException { String categoryStr = ((TagNode) table.evaluateXPath("./caption")[0]).getText().toString().trim(); CategoryEntity cat = new CategoryEntity(); cat.setMan(man); cat.setName(categoryStr); cat.getRaces().add(raceVolume); if (categoryStr.contains("39")) { cat.setFromAge(0); cat.setToAge(39); } else if (categoryStr.contains("70")) { cat.setFromAge(70); cat.setToAge(1000); } else if (categoryStr.contains("-")) { int ind = categoryStr.indexOf("-"); String from = categoryStr.substring(ind - 2, ind); String to = categoryStr.substring(ind, ind + 2); cat.setFromAge(Integer.valueOf(from)); cat.setToAge(Integer.valueOf(to)); } else if (categoryStr.contains("34")) { cat.setFromAge(0); cat.setToAge(34); } else if (categoryStr.contains("45")) { cat.setFromAge(45); cat.setToAge(1000); } else { throw new IllegalArgumentException("Category cannot be processed"); } cat = (CategoryEntity) checkPossibleMatches(cat, cat.getClass()); Object[] runners = table.evaluateXPath("/tbody/tr"); for (int i = 0; i < runners.length; i++) { TagNode row = (TagNode) runners[i]; // Person ------------- String name = ((TagNode) row.evaluateXPath("/td[@class='jmeno']")[0]).getText().toString().trim(); int index = name.indexOf(" "); String sn = name.substring(0, index).trim(); String fn = name.substring(index).trim(); String bd = ((TagNode) row.evaluateXPath("/td[@class='rn']")[0]).getText().toString().trim(); PersonEntity person = new PersonEntity(fn, sn, Integer.valueOf(bd), man); person = (PersonEntity) checkPossibleMatches(person, PersonEntity.class); // Club and city ------------- String all = ((TagNode) row.evaluateXPath("/td[@class='klub']")[0]).getText().toString().trim(); int delI = all.indexOf("("); String clubStr; String cityStr; if (delI >= 0) { clubStr = all.substring(0, delI).trim(); cityStr = all.substring(delI); cityStr = cityStr.replace("(", "").replace(")", "").trim(); } else { clubStr = all.trim(); cityStr 
= ""; } if (clubStr.isEmpty()) { clubStr = "-"; } if (cityStr.isEmpty()) { cityStr = "-"; } ClubEntity club = new ClubEntity(clubStr); club = (ClubEntity) checkPossibleMatches(club, club.getClass()); CityEntity city = new CityEntity(cityStr); city = (CityEntity) checkPossibleMatches(city, city.getClass()); // Number ------------- Integer number = Integer.valueOf( ((TagNode) row.evaluateXPath("/td[@class='stCislo']")[0]) .getText() .toString() .trim()); // Time ------------- String timeStr = ((TagNode) row.evaluateXPath("/td[@class='cas']")[0]).getText().toString().trim(); if (timeStr.equalsIgnoreCase("NF")) { timeStr = "23:59:59"; } Date time = new SimpleDateFormat("H:mm:ss").parse(timeStr); // Runner ------------- RunnerEntity runner = new RunnerEntity(number, person, club, city, time); runner.setCategory(cat); runner.setRace(raceVolume); checkPossibleMatches(runner, runner.getClass()); } }
/**
 * Loads the MySQL JDBC driver class, reads version name, version code, APK size, download URL
 * and download name of the first version block via XPath, and clears {@code IsNewest} when the
 * scraped version differs from {@code sql_version}.
 *
 * <p>NOTE(review): the code between the {@code "jdbc:mysql://"} prefix and the first XPath
 * suffix appears to have been destroyed by credential redaction. The statements that used the
 * connection, fetched the page and declared {@code node} and {@code tagVersion} are not visible
 * here, so the method must be restored from version control before it can compile.
 *
 * <p>NOTE(review): {@code app_url1.replaceAll("&", "&")} replaces a string with itself as
 * written; presumably the first argument was an HTML entity that got decoded - confirm.
 *
 * <p>NOTE(review): {@code app_verison != sql_version} compares references, not contents, if
 * both are {@code String}s ({@code app_verison} is assigned a concatenation result) - verify
 * the type of {@code sql_version} and use {@code equals} if so.
 *
 * @param args unused
 */
public static void main(String[] args) { // Check the database try { Class.forName("org.gjt.mm.mysql.Driver"); System.out.println("Success loading Mysql Driver!"); Connection connect = DriverManager.getConnection("jdbc:mysql://*****:*****@class='version-block']/div[" + "1" + "]//i[@itemprop='softwareVersion']"); app_verison = ((TagNode) tagVersion[0]).getText() + ""; System.out.println(((TagNode) tagVersion[0]).getText() + ""); Object[] tagVersionCode = node.evaluateXPath( "/body/div//div[@class='version-block']/div[" + "1" + "]//span[@class='version-code']"); String app_versioncode = ((TagNode) tagVersionCode[0]).getText() + ""; System.out.println(((TagNode) tagVersionCode[0]).getText() + ""); Object[] tagFileSize = node.evaluateXPath( "/body/div//div[@class='version-block']/div[" + "1" + "]//span[@class='apk-size']"); String app_size = ((TagNode) tagFileSize[0]).getText() + ""; System.out.println(((TagNode) tagFileSize[0]).getText() + ""); Object[] tagDownload = node.evaluateXPath( "/body/div//div[@class='version-block']/div[" + "1" + "]//a[@download]"); String app_url1 = ((TagNode) tagDownload[0]).getAttributeByName("href"); app_url = app_url1.replaceAll("&", "&"); System.out.println("下载地址: " + app_url + "\n"); app_name = ((TagNode) tagDownload[0]).getAttributeByName("download"); } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (XPatherException e) { e.printStackTrace(); } // Compare against the database to decide whether the app is the newest version if (app_verison != sql_version) { IsNewest = false; } }
@Override public boolean authenticate() { if (!super.authenticate()) { LOG.error( String.format( "blank username or password detected, no %s xword will be downloaded", this.getType())); return false; } final HttpUriRequest loginGet = RequestBuilder.get().setUri(NYT_LOGIN_URL).build(); final String loginPage; try (final CloseableHttpResponse getResponse = this.getHttpClient().execute(loginGet)) { loginPage = EntityUtils.toString(getResponse.getEntity()); } catch (final IOException e) { LOG.error("error while navigating to NYT login page", e); return false; } final String token; final String expires; try { final TagNode node = this.getCleaner().clean(loginPage); final Object[] foundNodes = node.evaluateXPath("//input[@name='token']"); if (foundNodes.length != 1) { this.throwLoginException( "unexpected login page, found %d hidden token input elements, expected 1", foundNodes.length); } final TagNode hiddenTokenInput = (TagNode) foundNodes[0]; token = hiddenTokenInput.getAttributeByName("value"); LOG.debug("found hidden input token {}", token); final Object[] foundExpiresNodes = node.evaluateXPath("//input[@name='expires']"); if (foundExpiresNodes.length != 1) { this.throwLoginException( "unexpected login page, found %d hidden token expiration input elements, expected 1", foundNodes.length); } final TagNode hiddenTokenExpiresInput = (TagNode) foundExpiresNodes[0]; expires = hiddenTokenExpiresInput.getAttributeByName("value"); LOG.debug("found hidden input token expiration {}", expires); } catch (LoginException | XPatherException e) { LOG.error("error while pulling login tokens from NYT login page", e); return false; } // @formatter:off final HttpUriRequest loginPost = RequestBuilder.post() .setUri("https://myaccount.nytimes.com/auth/login") .addParameter("is_continue", Boolean.FALSE.toString()) .addParameter("token", token) .addParameter("expires", expires) .addParameter("userid", this.getLoginInfo().getUsername()) .addParameter("password", 
this.getLoginInfo().getPassword()) .addParameter("remember", Boolean.TRUE.toString()) .build(); // @formatter:on try (CloseableHttpResponse postResponse = this.getHttpClient().execute(loginPost)) { // successful NYT login should give 302 status final int responseStatus = postResponse.getStatusLine().getStatusCode(); if (responseStatus != 302) { final String errorMessage = String.format("did not detect expected 302 redirect, got %d instead", responseStatus); throw new LoginException(errorMessage); } // successful NYT login redirects to the NYT homepage final Header location = postResponse.getFirstHeader("Location"); // have seen this redirect both with and without the final portion final Pattern expectedRedirectLocation = Pattern.compile("http://www.nytimes.com(\\?login=email)*"); final String actualRedirectLocation = location.getValue(); final Matcher matcher = expectedRedirectLocation.matcher(actualRedirectLocation); if (!matcher.matches()) { final String errorMessage = String.format( "redirect to unexpected URL, expected %s, found Location=%s instead", expectedRedirectLocation, actualRedirectLocation); throw new LoginException(errorMessage); } // successful NYT login should set a few cookies final Header[] cookies = postResponse.getHeaders("Set-Cookie"); if (cookies.length < 1) { throw new LoginException("no post login cookies set, login likely failed"); } } catch (final IOException | LoginException e) { LOG.error("error while logging in, e={}", e.getMessage()); return false; } LOG.info("successfully logged in to nyt"); return true; }
private void recordInputFields( TagNode formNode, FormFlow formFlow, Document dataDocument, String docBase) throws XPathExpressionException, XPatherException { List<InputPojo> inputPojos = new ArrayList<InputPojo>(); Map<String, InputPojo> inputPojosMap = new HashMap<String, InputPojo>(); @SuppressWarnings("unchecked") List<TagNode> inputs = formNode.getElementListByName("input", true); @SuppressWarnings("unchecked") List<TagNode> selects = formNode.getElementListByName("select", true); inputs.addAll(selects); for (TagNode inputTagNode : inputs) { String name = inputTagNode.getAttributeByName(Constants.NAME_ATTR); if (name != null) { String type; if (inputTagNode.getName().equals("select")) { type = "select"; } else { type = inputTagNode.getAttributeByName(Constants.TYPE_ATTR); } if (type != null) { if (!(type.equals("radio") && inputPojosMap.containsKey(name))) { // Collect all rf.xxx attributes Map<String, String> rfAttributes = new HashMap<String, String>(); Map<String, String> attributes = inputTagNode.getAttributes(); for (String attName : attributes.keySet()) { if (attName.startsWith("rf.")) { rfAttributes.put(attName, attributes.get(attName)); } } InputPojo inputPojo = new InputPojo(name, type, rfAttributes); inputPojosMap.put(name, inputPojo); inputPojos.add(inputPojo); } // Push values from the dataDocument into the form html. 
String inputValue = lookupValueByFieldName(dataDocument, name, docBase); if (inputValue != null) { if (type.equals("radio")) { String value = inputTagNode.getAttributeByName(Constants.VALUE_ATTR); if (inputValue.equals(value)) { inputTagNode.setAttribute(Constants.CHECKED_ATTR, Constants.CHECKED_ATTR); } } else if (type.equals("checkbox")) { if (inputValue.equals("true")) { inputTagNode.setAttribute(Constants.CHECKED_ATTR, Constants.CHECKED_ATTR); } } else if (type.equals("select")) { Object[] nodes = inputTagNode.evaluateXPath("option[@value=\"" + inputValue + "\"]"); if (nodes.length == 0) { nodes = inputTagNode.evaluateXPath("option[text()=\"" + inputValue + "\"]"); } if (nodes.length > 0) { ((TagNode) nodes[0]).setAttribute(Constants.SELECTED_ATTR, "selected"); } } else { inputTagNode.setAttribute("value", inputValue); } } } else { logger.debug("Input name:{} has no type attribute!", name); } } } formFlow.setCurrentInputPojos(inputPojos); }
public void parseForm( InputStream formStream, FormFlow formFlow, PrintWriter writer, JSMasterScope masterScope, boolean suppressDebugBar) throws XPatherException, XPathExpressionException, IOException, ResourceLoaderException, FormParserException { TagNode formHtml = htmlCleaner.clean(formStream); String flowID = formFlow.getId(); Document dataDocument = formFlow.getDataDocument(); String docBase = formFlow.getCurrentDocBase(); String currentPath = formFlow.getCurrentPath(); Map<String, FlowAction> currentActions = formFlow.getCurrentActions(); // Process rf.include processIncludes(formHtml, formFlow); // Add debugBar if (showDebugBar && !suppressDebugBar) { addDebugBar(formHtml); } // Process rf.forEach statements valueInjector.processForEachStatements(formHtml, dataDocument, docBase); valueInjector.processRemainingCurlyBrackets(formHtml, dataDocument, docBase, flowID); // Process first Rhinoforms form in doc Object[] rfFormNodes = formHtml.evaluateXPath("//form[@" + Constants.RHINOFORMS_FLAG + "='true']"); if (rfFormNodes.length > 0) { logger.debug("{} forms found.", rfFormNodes.length); TagNode formNode = (TagNode) rfFormNodes[0]; // Process dynamic select elements processSelectSource(formNode, formFlow); // Process range select elements processSelectRange(formNode, masterScope); // Record input fields recordInputFields(formNode, formFlow, dataDocument, docBase); // Process Actions processActions(currentActions, formNode); // Process auto-complete fields, replace source with proxy path processInputSourceFields(formNode, currentPath, formFlow); // Add flowId as hidden field addFlowId(flowID, formNode); // Mark form as parsed formNode.setAttribute("parsed", "true"); } else { logger.warn("No forms found"); } // Write out processed document new SimpleHtmlSerializer(htmlCleaner.getProperties()).write(formHtml, writer, "utf-8"); }
/**
 * Expands every {@code <select>} carrying {@code Constants.SELECT_RANGE_START_ATTR} into a list
 * of numeric {@code <option>}s.
 *
 * <p>The range start and end attributes are evaluated as script expressions in a working scope
 * created from {@code masterScope}. The range is built by a generated script that counts up
 * when start is less than end and down otherwise, both bounds inclusive. Unless the preselect
 * attribute equals {@code "true"}, a "-- Please Select --" option with an empty value is added
 * first. The range and preselect attributes are removed from the element. A select whose start
 * or end expression is missing or empty is left without options and a warning naming the empty
 * attribute is logged.
 *
 * @param formNode form element to scan
 * @param masterScope provider of the script context and working scope
 * @throws XPatherException if the XPath query fails
 */
private void processSelectRange(TagNode formNode, JSMasterScope masterScope)
    throws XPatherException {
  Object[] rangeSelectNodes =
      formNode.evaluateXPath("//select[@" + Constants.SELECT_RANGE_START_ATTR + "]");
  if (rangeSelectNodes.length == 0) {
    return;
  }

  Scriptable workingScope = masterScope.createWorkingScope();
  Context context = masterScope.getCurrentContext();

  for (Object rangeSelectNodeO : rangeSelectNodes) {
    TagNode rangeSelectNode = (TagNode) rangeSelectNodeO;
    String name = rangeSelectNode.getAttributeByName(Constants.NAME_ATTR);
    String rangeStart = rangeSelectNode.getAttributeByName(Constants.SELECT_RANGE_START_ATTR);
    String rangeEnd = rangeSelectNode.getAttributeByName(Constants.SELECT_RANGE_END_ATTR);
    String preselectFirstOption =
        rangeSelectNode.getAttributeByName(Constants.SELECT_PRESELECT_FIRST_OPTION_ATTR);

    rangeSelectNode.removeAttribute(Constants.SELECT_RANGE_START_ATTR);
    rangeSelectNode.removeAttribute(Constants.SELECT_RANGE_END_ATTR);
    rangeSelectNode.removeAttribute(Constants.SELECT_PRESELECT_FIRST_OPTION_ATTR);

    logger.debug(
        "Found rangeSelectNode name:{}, rangeStart:{}, rangeEnd:{}",
        new String[] {name, rangeStart, rangeEnd});

    boolean rangeStartValid = rangeStart != null && !rangeStart.isEmpty();
    boolean rangeEndValid = rangeEnd != null && !rangeEnd.isEmpty();
    if (!rangeStartValid || !rangeEndValid) {
      // Name the attribute that is actually empty. The ternary was inverted before and
      // reported the start attribute exactly when the start was valid.
      logger.warn(
          "Range select node '{}' not processed because {} is empty.",
          name,
          (rangeStartValid
              ? Constants.SELECT_RANGE_END_ATTR
              : Constants.SELECT_RANGE_START_ATTR));
      continue;
    }

    Object rangeStartResult =
        context.evaluateString(
            workingScope, "{" + rangeStart + "}", Constants.SELECT_RANGE_START_ATTR, 1, null);
    Object rangeEndResult =
        context.evaluateString(
            workingScope, "{" + rangeEnd + "}", Constants.SELECT_RANGE_END_ATTR, 1, null);
    logger.debug(
        "RangeSelectNode name:{}, rangeStartResult:{}, rangeEndResult:{}",
        new Object[] {name, rangeStartResult, rangeEndResult});

    // Count upwards or downwards depending on which bound is larger.
    double rangeStartResultNumber = Context.toNumber(rangeStartResult);
    double rangeEndResultNumber = Context.toNumber(rangeEndResult);
    String comparator;
    String incrementor;
    if (rangeStartResultNumber < rangeEndResultNumber) {
      comparator = "<=";
      incrementor = "++";
    } else {
      comparator = ">=";
      incrementor = "--";
    }

    // The generated script yields the range as a comma-separated string.
    String rangeStatement =
        "{ var range = []; for( var i = "
            + rangeStartResult
            + "; i "
            + comparator
            + " "
            + rangeEndResult
            + "; i"
            + incrementor
            + ") { range.push(i); }; '' + range; }";
    logger.debug("RangeSelectNode name:{}, rangeStatement:{}", name, rangeStatement);
    String rangeResult =
        (String) context.evaluateString(workingScope, rangeStatement, "Calculate range", 1, null);
    logger.debug("RangeSelectNode name:{}, rangeResult:{}", name, rangeResult);

    if (!"true".equals(preselectFirstOption)) {
      TagNode placeholderOption = new TagNode("option");
      placeholderOption.setAttribute("value", "");
      placeholderOption.addChild(new ContentNode("-- Please Select --"));
      rangeSelectNode.addChild(placeholderOption);
    }
    for (String item : rangeResult.split(",")) {
      TagNode optionNode = new TagNode("option");
      optionNode.addChild(new ContentNode(item));
      rangeSelectNode.addChild(optionNode);
    }
  }
}