Example #1
0
  /**
   * Converts a {@code TaggerDocument} into the {@code Document} subtype selected by its doctype
   * ({@code Tweet}, {@code SMS} or {@code Facebook}) and copies over the document ID, crisis ID,
   * crisis code, language, training-sample value and word features.
   *
   * <p>The word features string of {@code doc} is passed to
   * {@code FeatureExtractor.getWordsInStringWithBigrams} and the result is attached to the new
   * document as a {@code WordSet}. {@code humanLabelCount} is set to 1 when the source reports
   * human labels and to 0 otherwise. Human labels themselves are not copied.
   *
   * @param doc the source document; may be {@code null}
   * @return the converted document, or {@code null} when {@code doc} is {@code null}
   * @throws IllegalArgumentException if the doctype of {@code doc} is {@code null} or is not one
   *     of the recognized document types
   */
  public static Document fromTaggerDocumentToDocument(TaggerDocument doc) {
    if (doc == null) {
      return null;
    }

    // Compare constant-first so that a null doctype falls through to the explicit error below
    // instead of raising a NullPointerException here.
    final String doctype = doc.getDoctype();
    final Document document;
    if (DocumentType.TWIITER_DOC.equalsIgnoreCase(doctype)) {
      document = new Tweet();
    } else if (DocumentType.SMS_DOC.equalsIgnoreCase(doctype)) {
      document = new SMS();
    } else if (DocumentType.FACEBOOK_DOC.equalsIgnoreCase(doctype)) {
      document = new Facebook();
    } else {
      // Previously this case left the target null and failed later with an uninformative
      // NullPointerException on the first setter call.
      throw new IllegalArgumentException("Unsupported document type: " + doctype);
    }

    document.setDocumentID(doc.getDocumentID());
    document.setCrisisID(doc.getCrisisID());
    document.humanLabelCount = doc.hasHumanLabels() ? 1 : 0;
    document.setCrisisCode(doc.getCrisisCode());
    document.setLanguage(doc.getLanguage());

    // Rebuild the word feature set from the stored word-features string.
    WordSet wordSet = new WordSet();
    String text = doc.getWordFeatures();
    wordSet.addAll(FeatureExtractor.getWordsInStringWithBigrams(text, false));
    document.addFeatureSet(wordSet);

    document.setValueAsTrainingSample(doc.getValueAsTrainingSample());

    // NOTE: copying of human labels (NominalLabelBC) onto the document is intentionally not
    // performed here; the code that did so was disabled.
    return document;
  }
Example #2
0
  /**
   * Builds a new {@code TaggerDocument} from the given {@code Document}.
   *
   * <p>Copied fields: human-labels flag, crisis ID, crisis code, language and training-sample
   * value. The doctype is the simple class name of {@code doc}. The data field is the escaped
   * string form of the input JSON, or {@code null} when there is no input JSON. Word features
   * are set only when {@code doc.features} is non-null; geo features are always set to
   * {@code null}. The received-at timestamp is the current system time, and the document is
   * randomly flagged as part of the evaluation set with probability 1/5.
   *
   * <p>The document ID is not set here; per the original note it is auto-generated by the
   * DB/Hibernate and must be assigned separately. Nominal labels are not copied.
   *
   * @param doc the source document; may be {@code null}
   * @return the populated tagger document, or {@code null} when {@code doc} is {@code null}
   */
  public static TaggerDocument fromDocumentToTaggerDocument(Document doc) {
    if (doc == null) {
      return null;
    }

    // Fraction of converted documents that are randomly assigned to the evaluation set.
    final double evaluationSetFraction = 1.0 / 5.0;

    // NOTE: documentID needs to be set separately as Auto Generation ID from DB/Hibernate
    TaggerDocument document = new TaggerDocument();
    document.setHasHumanLabels(doc.hasHumanLabels());
    document.setCrisisID(doc.getCrisisID());
    document.setCrisisCode(doc.getCrisisCode());
    document.setReceivedAt(new java.sql.Timestamp(System.currentTimeMillis()));
    document.setLanguage(doc.getLanguage());

    document.setDoctype(doc.getClass().getSimpleName());
    if (doc.getInputJson() != null) {
      document.setData(Helpers.escapeJson(doc.getInputJson().toString()));
    } else {
      document.setData(null);
    }
    if (doc.features != null) {
      document.setWordFeatures(DocumentJSONConverter.getFeaturesJson(WordSet.class, doc));
    }
    document.setGeoFeatures(null);
    document.setValueAsTrainingSample(doc.getValueAsTrainingSample());
    document.setIsEvaluationSet(Math.random() < evaluationSetFraction);

    // NOTE: the nominal label collection is intentionally not populated here; the code that
    // mapped NominalLabelBC entries to NominalLabel was disabled.
    return document;
  }