Columbia University Framework for Relationship Extraction

Implementing my Feature Generator

There may be some cases in which the features that we provide are not enough for the problem you are trying to solve or you just want to try different features without much implementation effort. Next, we show how to create your own feature generator that produces a dependency graph (we will use Stanford NLP for that task).

The first step in creating a feature generator is to prepare a structure that will hold the generator's result. The objects returned by a feature generator must be of type FeatureSet. In this case, our feature generator must return a dependency graph, which means that we must create a class called GraphFS:

package edu.columbia.cs.ref.model.feature.impl;

import java.io.Serializable;

import edu.columbia.cs.ref.model.core.structure.OperableStructure;

import edu.columbia.cs.ref.model.feature.FeatureSet;

import edu.columbia.cs.utils.SimpleGraph;

//An object that is returned by a feature generator must be a FeatureSet 

public class GraphFS extends FeatureSet {

    //A GraphFS is composed by a Graph which is a representation of

    //a generic graph

    private SimpleGraph graph;

    //The constructor receives a graph

    public GraphFS(SimpleGraph graph){

        this.graph=graph;

    }

    //Method to obtain the graph (in order for the relationship extraction

    //method to be able to retrieve it) 

    public SimpleGraph getGraph(){

        return graph;

    }

    @Override

    public String toString(){

        return graph.toString();

    }

    //Method that is used to enrich OperableStructures with new features

    //The OperableStructure must have a method add that receives a GraphFS 

    @Override

    public void enrichMe(OperableStructure operableStructure) {

        operableStructure.add(this);

    }

}

Now that we have the structure that will store the result of the feature generator, we can actually create the class for the generator itself.

package edu.columbia.cs.ref.algorithm.feature.generation.impl;

import java.io.IOException;

import java.util.ArrayList;

import java.util.Collection;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import opennlp.tools.util.InvalidFormatException;

import edu.columbia.cs.ref.algorithm.feature.generation.CandidateSentenceFeatureGenerator;

import edu.columbia.cs.ref.algorithm.feature.generation.FeatureGenerator;

import edu.columbia.cs.ref.algorithm.feature.generation.SentenceFeatureGenerator;

import edu.columbia.cs.ref.model.CandidateSentence;

import edu.columbia.cs.ref.model.Sentence;

import edu.columbia.cs.ref.model.feature.impl.GraphFS;

import edu.columbia.cs.ref.model.feature.impl.SequenceFS;

import edu.columbia.cs.utils.SimpleGraph;

import edu.columbia.cs.utils.Span;

import edu.stanford.nlp.ling.Word;

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;

import edu.stanford.nlp.trees.EnglishGrammaticalStructure;

import edu.stanford.nlp.trees.Tree;

import edu.stanford.nlp.trees.TypedDependency;

//The dependency parsing of a given sentence does not depend on the entities so this

//feature should be the same for all the candidates which correspond to the same sentence.

//Thus, it is a SentenceFeatureGenerator. 

public class StanfordNLPDependencyGraphFG extends SentenceFeatureGenerator> {

    private LexicalizedParser parser;

    private FeatureGenerator> tokenizer;

    //The constructor receives the path to the parser model and a tokenizer which

    //will be used to produce the tokenization before the parsing 

    public StanfordNLPDependencyGraphFG(String path, FeatureGenerator> tokenizer)

            throws InvalidFormatException, IOException{

        parser = new LexicalizedParser(path);

        this.tokenizer = tokenizer;

    }

    //This method just transforms a sequence of spans into words that are the

    //input of the Stanford NLP parser

    private List getTokens(SequenceFS spans, Sentence sentence){

        String value = sentence.getValue();

        List tokens = new ArrayList();

        for(int i=0; i            Span s = spans.getElement(i);

            tokens.add(new Word(value.substring(s.getStart(),s.getEnd())));

        }

        return tokens;

    }

    //This method is used to define the feature generators that the StanfordNLPDependencyGraphFG

    //is dependent on. In this case it is only on the tokenizer given as input to

    //the constructor.

    @Override

    protected List retrieveRequiredFeatureGenerators() {

        ArrayList ret = new ArrayList();

        ret.add(tokenizer);

        return ret;

    }

    //The main method of this class. This method produces the dependecy parse of the sentence

    //given as input.

    @Override

    protected GraphFS extractFeatures(Sentence sentence) {

        SequenceFS tokenization = sentence.getFeatures(tokenizer);

        List tokens = getTokens(tokenization, sentence);

        parser.parse(tokens);

        Tree parsingTree = parser.getBestPCFGParse();

        EnglishGrammaticalStructure struc = new EnglishGrammaticalStructure(parsingTree);

        Collection dependencies = struc.allTypedDependencies();

        SimpleGraph g = new SimpleGraph(tokenization.size());

        int tokenizationSize=tokenization.size();

        for(int i=0; i            g.addNode(i, i);

        }

        for(TypedDependency dep : dependencies){

            g.addEdge(dep.gov().index()-1, dep.dep().index()-1, dep.reln().getShortName());

        }

        return new GraphFS(g);

    }

}