Implementing my Feature Generator |
There may be some cases in which the features that we provide are not enough for the problem you are trying to solve or you just want to try different features without much implementation effort. Next, we show how to create your own feature generator that produces a dependency graph (we will use Stanford NLP for that task). The first step for creating a feature generator is to prepare a structure that will contain the result of the generator. The objects returned by the feature generator must be of the type FeatureSet. In this case, our feature generator must return a dependency graph, which means that we must create an object called GraphFS: package edu.columbia.cs.ref.model.feature.impl;
import java.io.Serializable; import edu.columbia.cs.ref.model.core.structure.OperableStructure; import edu.columbia.cs.ref.model.feature.FeatureSet; import edu.columbia.cs.utils.SimpleGraph; //An object that is returned by a feature generator must be a FeatureSet public class GraphFS extends FeatureSet { //A GraphFS is composed by a Graph which is a representation of //a generic graph private SimpleGraph graph; //The constructor receives a graph public GraphFS(SimpleGraph graph){ this.graph=graph; } //Method to obtain the graph (in order for the relationship extraction //method to be able to retrieve it) public SimpleGraph getGraph(){ return graph; } @Override public String toString(){ return graph.toString(); } //Method that is used to enrich OperableStructures with new features //The OperableStructure must have a method add that receives a GraphFS @Override public void enrichMe(OperableStructure operableStructure) { operableStructure.add(this); } } Now that we have the structure that will store the result of the feature generator, we can actually create the class for the generator itself. package edu.columbia.cs.ref.algorithm.feature.generation.impl;
import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import opennlp.tools.util.InvalidFormatException; import edu.columbia.cs.ref.algorithm.feature.generation.CandidateSentenceFeatureGenerator; import edu.columbia.cs.ref.algorithm.feature.generation.FeatureGenerator; import edu.columbia.cs.ref.algorithm.feature.generation.SentenceFeatureGenerator; import edu.columbia.cs.ref.model.CandidateSentence; import edu.columbia.cs.ref.model.Sentence; import edu.columbia.cs.ref.model.feature.impl.GraphFS; import edu.columbia.cs.ref.model.feature.impl.SequenceFS; import edu.columbia.cs.utils.SimpleGraph; import edu.columbia.cs.utils.Span; import edu.stanford.nlp.ling.Word; import edu.stanford.nlp.parser.lexparser.LexicalizedParser; import edu.stanford.nlp.trees.EnglishGrammaticalStructure; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TypedDependency; //The dependency parsing of a given sentence does not depend on the entities so this //feature should be the same for all the candidates which correspond to the same sentence. //Thus, it is a SentenceFeatureGenerator. 
public class StanfordNLPDependencyGraphFG extends SentenceFeatureGenerator private LexicalizedParser parser; private FeatureGenerator //The constructor receives the path to the parser model and a tokenizer which //will be used to produce the tokenization before the parsing public StanfordNLPDependencyGraphFG(String path, FeatureGenerator throws InvalidFormatException, IOException{ parser = new LexicalizedParser(path); this.tokenizer = tokenizer; } //This method just transforms a sequence of spans into words that are the //input of the Stanford NLP parser private List String value = sentence.getValue(); List for(int i=0; i tokens.add(new Word(value.substring(s.getStart(),s.getEnd()))); } return tokens; } //This method is used to define the feature generators that the StanfordNLPDependencyGraphFG //is dependent on. In this case it is only on the tokenizer given as input to //the constructor. @Override protected List ArrayList ret.add(tokenizer); return ret; } //The main method of this class. This method produces the dependecy parse of the sentence //given as input. @Override protected GraphFS SequenceFS tokenization = sentence.getFeatures(tokenizer); List parser.parse(tokens); Tree parsingTree = parser.getBestPCFGParse(); EnglishGrammaticalStructure struc = new EnglishGrammaticalStructure(parsingTree); Collection SimpleGraph int tokenizationSize=tokenization.size(); for(int i=0; i } for(TypedDependency dep : dependencies){ g.addEdge(dep.gov().index()-1, dep.dep().index()-1, dep.reln().getShortName()); } return new GraphFS } } |