コード例 #1
0
  /**
   * Scans a line-oriented XML file and collects the ids of {@code row} elements whose title
   * contains the given word.
   *
   * <p>Each line that starts with {@code <row} (after trimming) is parsed on its own as an XML
   * fragment. A row contributes its {@code Id} when all of the following hold:
   *
   * <ul>
   *   <li>{@code XMLUtil.getStringElement(row, "Title")} is not {@code null};
   *   <li>{@code XMLUtil.getStringElement(row, "Tags")} is neither {@code null} nor blank;
   *   <li>{@code SOUtil.hasJavaTag(tags)} is true;
   *   <li>the title contains {@code word} (case-sensitive substring match).
   * </ul>
   *
   * <p>Rows failing any check are skipped. A line that fails to parse has its stack trace printed
   * and is skipped as well, so one bad line does not stop the scan. Progress dots and line counts
   * are written to standard output.
   *
   * @param fileName path of the file to read
   * @param word substring looked for in each row title
   * @return the ids of all matching rows; never {@code null}, possibly empty
   * @throws IOException if the file cannot be opened or read
   */
  public static Set<Integer> extract(String fileName, String word) throws IOException {
    Set<Integer> ids = new HashSet<Integer>();
    // One factory serves every line; building a new one per line is loop-invariant work.
    XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
    // NOTE(review): FileReader decodes with the platform default charset - confirm this matches
    // the encoding of the input file.
    BufferedReader reader = new BufferedReader(new FileReader(fileName), 4 * 1024 * 1024);
    try {
      String line;
      int lineCount = 0;
      System.out.println("Reading file...");
      while ((line = reader.readLine()) != null) {
        lineCount++;
        if (lineCount % 2000 == 0) {
          System.out.print(".");
        }
        if (lineCount % 100000 == 0) {
          System.out.println(lineCount);
        }
        try {
          if (!line.trim().startsWith("<row")) {
            continue;
          }

          XMLEventReader xmlEventReader =
              xmlInputFactory.createXMLEventReader(new StringReader(line));
          while (xmlEventReader.hasNext()) {
            XMLEvent xmlEvent = xmlEventReader.nextEvent();
            if (!xmlEvent.isStartElement()) {
              continue;
            }
            StartElement startElement = xmlEvent.asStartElement();
            if (!startElement.getName().getLocalPart().equalsIgnoreCase("row")) {
              continue;
            }

            int id = XMLUtil.getIntElement(startElement, "Id");

            String title = XMLUtil.getStringElement(startElement, "Title");
            if (title == null) {
              continue;
            }
            String tags = XMLUtil.getStringElement(startElement, "Tags");
            if ((tags == null) || (tags.trim().length() == 0)) {
              // Skip only this row: bailing out of the method here would throw away every id
              // gathered so far and hand the caller a null collection.
              continue;
            }
            if (!SOUtil.hasJavaTag(tags)) {
              continue;
            }
            if (title.contains(word)) {
              ids.add(id);
            }
          }
        } catch (Exception e) {
          // Best effort: report the malformed line and carry on with the next one.
          e.printStackTrace();
        }
      }
    } finally {
      // Release the file handle on both normal and exceptional exit.
      reader.close();
    }
    return ids;
  }
コード例 #2
0
  /**
   * Collects tokenized code snippets from the rows of a line-oriented XML file that belong to a
   * given set of ids.
   *
   * <p>Each line that starts with {@code <row} (after trimming) is parsed on its own as an XML
   * fragment. A row is selected when its {@code Id} is in {@code ids}, or when its
   * {@code ParentId} is positive and in {@code ids}. For a selected row, every snippet returned
   * by {@code SOUtil.getCodeSet(body)} is split into lines; each line is lower-cased, trimmed,
   * filtered and tokenized, and the distinct tokens of the snippet are joined with single spaces
   * into one result entry.
   *
   * <p>A snippet whose lines are all filtered out still yields an (empty) entry. Token order
   * within an entry follows {@link HashSet} iteration order and is therefore unspecified. A line
   * that fails to parse has its stack trace printed and is skipped. Progress and two summary
   * counters are written to standard output.
   *
   * @param ids ids of the rows of interest
   * @param filePath path of the file to read
   * @param speedUp sampling factor: when positive, only lines whose 1-based line number is a
   *     multiple of this value are processed; when zero or negative, every line is processed
   * @return one space-separated token string per snippet found, in file order
   * @throws IOException if the file cannot be opened or read
   * @throws FactoryConfigurationError if no StAX implementation can be instantiated
   */
  public static List<String> getAllCode(Set<Integer> ids, String filePath, int speedUp)
      throws IOException, FactoryConfigurationError {
    List<String> code = new ArrayList<String>();
    // One factory serves every line; building a new one per line is loop-invariant work.
    XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
    // NOTE(review): FileReader decodes with the platform default charset - confirm this matches
    // the encoding of the input file.
    BufferedReader reader = new BufferedReader(new FileReader(filePath), 4 * 1024 * 1024);
    int emptyPosts = 0;
    int codeAdded = 0;
    try {
      String line;
      int lineCount = 0;
      System.out.println("Reading file...");
      while ((line = reader.readLine()) != null) {
        lineCount++;
        if (lineCount % 2000 == 0) {
          System.out.print(".");
        }
        if (lineCount % 100000 == 0) {
          System.out.println(lineCount);
        }
        if ((speedUp > 0) && (lineCount % speedUp != 0)) {
          continue;
        }
        try {
          if (!line.trim().startsWith("<row")) {
            continue;
          }

          XMLEventReader xmlEventReader =
              xmlInputFactory.createXMLEventReader(new StringReader(line));
          // Scratch set, cleared before each snippet so tokens are de-duplicated per snippet.
          Set<String> tempSet = new HashSet<String>();
          while (xmlEventReader.hasNext()) {
            XMLEvent xmlEvent = xmlEventReader.nextEvent();
            if (!xmlEvent.isStartElement()) {
              continue;
            }
            StartElement startElement = xmlEvent.asStartElement();
            if (!startElement.getName().getLocalPart().equalsIgnoreCase("row")) {
              continue;
            }

            int id = XMLUtil.getIntElement(startElement, "Id");
            int parentId = XMLUtil.getIntElement(startElement, "ParentId");
            boolean wanted = ids.contains(id) || ((parentId > 0) && ids.contains(parentId));
            if (!wanted) {
              continue;
            }

            String body = XMLUtil.getStringElement(startElement, "Body");
            Set<String> codeSet = SOUtil.getCodeSet(body);
            if (codeSet.size() == 0) {
              emptyPosts++;
            }
            for (String codeFound : codeSet) {
              String[] lines = codeFound.split("\r\n|\r|\n");
              tempSet.clear();
              for (String lineItem : lines) {
                // Locale.ROOT keeps lower-casing independent of the JVM's default locale
                // (e.g. a Turkish locale would map 'I' to a dotless i).
                lineItem = lineItem.toLowerCase(java.util.Locale.ROOT).trim();

                if (!CodeFragmentInspector.isJavaH(lineItem)) {
                  continue;
                }

                lineItem = StringUtil.cleanCode(lineItem);

                // Drop comments, very short lines, imports and declaration-style lines.
                if (lineItem.startsWith("//")
                    || lineItem.length() <= 2
                    || lineItem.startsWith("import ")
                    || lineItem.startsWith("public")
                    || lineItem.startsWith("private")
                    || lineItem.startsWith("protected")
                    || lineItem.startsWith("class")) {
                  continue;
                }

                tempSet.addAll(CodeFragmentInspector.tokenizeAsList(lineItem));
              }

              // Join with a builder rather than repeated String concatenation.
              StringBuilder joined = new StringBuilder();
              for (String token : tempSet) {
                joined.append(token).append(' ');
              }

              code.add(joined.toString().trim());
              codeAdded++;
            }
          }
        } catch (Exception e) {
          // Best effort: report the malformed line and carry on with the next one.
          e.printStackTrace();
        }
      }
    } finally {
      // Release the file handle on both normal and exceptional exit.
      reader.close();
    }
    System.out.println(emptyPosts + " empty posts found.");
    System.out.println(codeAdded + " snippets added.");
    return code;
  }