/**
 * Parses the HTML file at {@code pdfFile}, copies the case metadata found in it onto
 * {@code scd}, and returns the string form of the document body.
 *
 * <p>The text of the {@code h1} element(s) is upper-cased and read as
 * {@code "<PARTY> V <PARTY> (<CASE ID>) ..."}:
 * <ul>
 *   <li>the parties are the text before the first and after the last standalone
 *       {@code V} / {@code VS} / {@code VERSUS} token (an optional trailing dot is accepted);
 *       a title without such a token yields a single party;</li>
 *   <li>the case id is the text between the first {@code "("} following the last party and
 *       the next {@code "("} (or the end of the title), with every {@code ")"} removed.</li>
 * </ul>
 * Party names and the case id are trimmed. The decision date is the text of the first
 * {@code span.date-display-single} element. Any piece that is missing from the page is
 * stored as an empty string instead of aborting the extraction.
 *
 * @param scd document record that receives the case id, decision date, heard date (always
 *     {@code null}) and the participant names (each name followed by a tab)
 * @param pdfFile path of the file to parse; despite the name it is parsed as HTML
 * @param startpage not used by this method
 * @param endpage not used by this method
 * @return {@code document.body().toString()} of the parsed file, or {@code null} when
 *     {@code pdfFile} is {@code null} or the file could not be read
 */
String extractContentFromDocument(
        LesothoCourtDocument scd, String pdfFile, int startpage, int endpage) {
    // Validate before building the File: new File((String) null) throws
    // NullPointerException, so the check has to come first to be reachable.
    if (pdfFile == null) {
        System.out.println("File name is not valid");
        return null;
    }

    String textContent = null;
    try {
        Document document = Jsoup.parse(new File(pdfFile), "UTF-8");
        textContent = document.body().toString();

        String title = document.select("h1").text().toUpperCase();

        // Split only on a standalone separator token surrounded by whitespace. Splitting on
        // the bare letter "V" would also cut through names and case numbers such as
        // "DAVID" or "CIV/APN/1/2000". Limit -1 guarantees at least one array element.
        String[] sides = title.split("\\s+(?:V|VS|VERSUS)\\.?\\s+", -1);
        String[] lastSideAndId = sides[sides.length - 1].split("\\(", -1);

        // Without a separator there is only one party; do not report it twice.
        String firstParty = sides.length > 1 ? sides[0].trim() : "";
        String lastParty = lastSideAndId[0].trim();

        StringBuilder names = new StringBuilder();
        if (!firstParty.isEmpty()) {
            names.append(firstParty).append('\t');
        }
        if (!lastParty.isEmpty()) {
            names.append(lastParty).append('\t');
        }
        String participantsString = names.toString();
        System.out.println(participantsString);

        // A title without "(" simply has no case id.
        String extractedCaseId = "";
        if (lastSideAndId.length > 1) {
            extractedCaseId = lastSideAndId[1].replaceAll("\\)", "").trim();
        }

        // first() returns null when the page has no matching element.
        Element decDate = document.select("span.date-display-single").first();
        String extractedDecisionDate = decDate == null ? "" : decDate.text();

        System.out.println("Decision Date: " + extractedDecisionDate);
        System.out.println("Case Id: " + extractedCaseId);

        scd.setCaseId(extractedCaseId);
        scd.setDecisionDate(extractedDecisionDate);
        scd.setHeardDate(null);
        scd.setParticipantsName(participantsString);
    } catch (Exception e) {
        // Best effort: report the problem and return whatever content was read so far.
        System.out.println("Error in parsing html : " + e.getMessage());
    }
    return textContent;
}
/**
 * Builds the court document record for a single case file.
 *
 * <p>A {@link LesothoCourtDocument} is created from this processor's country name, court
 * name and processed-user name plus {@code sourceFileName}. The case file is then passed to
 * {@code extractContentFromDocument} (with start and end page both 1), and the returned
 * content is handed, as a one-element list, to {@code extractCitations} together with the
 * court name.
 *
 * @param caseFile path of the case file to process
 * @param sourceFileName source file name stored on the created document
 * @return the populated document
 */
CourtDocument processCaseDetails(String caseFile, String sourceFileName) {
    final int firstPage = 1;
    final int lastPage = 1;

    LesothoCourtDocument courtDocument =
            new LesothoCourtDocument(
                    this.getCountryName(),
                    this.getCourtName(),
                    this.getProcessedUserName(),
                    sourceFileName);

    String bodyContent =
            extractContentFromDocument(courtDocument, caseFile, firstPage, lastPage);

    // NOTE(review): this records the number of characters in the path string, not the size
    // of the file on disk -- confirm which one setFileLength is meant to receive.
    this.setFileLength(caseFile.length());

    // The content may be null when extraction failed; it is forwarded unchanged.
    List<String> pages = new ArrayList<String>();
    pages.add(bodyContent);
    courtDocument.extractCitations(this.getCourtName(), pages);

    return courtDocument;
}