/** * to send Head request to the server to check for content type and content size * * @param current_url * @return */ public boolean is_valid_file(String current_url) throws IOException { // System.out.print("[Output from log4j] Checking validity for url +" + current_url); // create and send HEAD request // // // // System.out.println("[Output from log4j] before head request + " + current_url); this.response_headers = crawler_client.fetch_head_response_url(current_url); // // // // System.out.println("[Output from log4j] after head request + " + // response_headers.size()); // // // // System.out.println("[Output from log4j] After fetching response header in // is_valid_file"); // base case - no response headers recieved from the server if (this.response_headers == null) return false; // check for location header if (this.response_headers.containsKey("location")) { this.is_reloc = true; String re_loc = this.response_headers.get("location"); if (re_loc.startsWith("/")) { URL url_obj = new URL(current_url); String path = url_obj.getPath(); String abs_reloc; if (path.endsWith(".xml") || path.endsWith(".html") || path.endsWith("htm")) path = path.substring(0, path.lastIndexOf("/")); if (path.endsWith("/")) abs_reloc = url_obj.getProtocol() + "://" + url_obj.getHost() + path.substring(0, path.length() - 1) + re_loc; else abs_reloc = url_obj.getProtocol() + "://" + url_obj.getHost() + path + re_loc; System.err.println("[Output from log4j] Found Relocation url +" + abs_reloc); WebURLQueue queue = new WebURLQueue(); queue.addToQueue(abs_reloc); return false; } else { WebURLQueue queue = new WebURLQueue(); queue.addToQueue(re_loc); return false; } } // if content type is not present in response header if (!(this.response_headers.containsKey("content-type"))) return false; // check valid content types this.content_type = this.response_headers.get("content-type"); // if valid content type if (!content_type.equals("text/xml") && !content_type.equals("text/html") && !content_type.endsWith("+xml") && !content_type.equals("application/xml")) return false; // // // // System.out.println("[Output from log4j] Chheck till Content type"); // check content-length exists if (this.response_headers.containsKey("content-length")) { this.content_length = Double.parseDouble(this.response_headers.get("content-length")); // checking allowed content-length for the document if (this.content_length > (XPathCrawler.maxsize_doc * 1024 * 1024)) return false; } // // // // System.out.println("[Output from log4j] Chheck till Content Length"); return true; }