elements = Arrays.asList("hydrogen", "lithium", "sodium", "natrium", "potassium", "kalium", "rubidium", "caesium", "cesium", "francium", "beryllium", "magnesium", "calcium", "strontium", "barium", "radium", "aluminium", "aluminum", "gallium", "indium", "thallium", "tin", "stannum", "lead", "plumbum", "bismuth", "polonium", "scandium", "titanium", "vanadium", "chromium", "manganese", "iron", "cobalt", "nickel", "copper", "zinc", "yttrium", "zirconium", "niobium", "molybdenum", "technetium", "ruthenium", "rhodium", "palladium", "silver", "cadmium", "lanthanum", "cerium", "praseodymium", "neodymium", "promethium", "samarium", "europium", "gadolinium", "terbium", "dysprosium", "holmium", "erbium", "thulium", "ytterbium", "lutetium", "hafnium", "tantalum", "tungsten", "wolfram", "rhenium", "osmium", "iridium", "platinum", "gold", "mercury", "hydrargyrum", "actinium", "thorium", "protactinium", "uranium", "neptunium", "plutonium", "americium", "curium", "berkelium", "californium", "einsteinium", "fermium", "mendelevium", "nobelium", "lawrencium", "rutherfordium", "boron", "carbon", "silicon", "germanium", "nitrogen", "phosphorus", "arsenic", "antimony", "stibium", "oxygen", "sulfur", "selenium", "tellurium", "polonium", "fluorine", "chlorine", "bromine", "iodine", "astatine", "helium", "neon", "argon", "krypton", "xenon", "radon");
// Whole-token pattern for a numeric value fused to a unit, e.g. 4.5g; subTokenize tests it with matches() and then splits value from unit.
private static Pattern CONCAT_AMOUNT_PATTERN = Pattern.compile("[~]?\\d*\\.?(\\d(\\d+|\\.\\d+|\\d*[mk\u00b5u])(g|l|hPa)[s]?|(\\d+[mnk\u00b5u]?([LMN]|[eE][qQ][\\.]?|[cCdD][mM]3|[gG][rR][aA][mM][mM]?[eE]?|[mM][oO][lL][eE]?|[mM][oO][lL][aA][rR])[sS]?))$");
// Token beginning with "pH" directly followed by an optionally negative integer, e.g. pH7.
private static Pattern CONCAT_PH_PATTERN = Pattern.compile("^pH-?\\d+");
// Digits directly followed by a degree marker (letter o, degree sign or masculine ordinal) and C/F, e.g. 50oC; group 1 is the degree marker.
private static Pattern CONCAT_TEMP_PATTERN = Pattern.compile("\\d+(o|\u00b0|\u00ba)[cCfF][\\.]?");
// One percent sign (group 2) with the percent-free text before it (group 1) and after it (group 3).
private static Pattern CONCAT_PERCENTAGE_PATTERN = Pattern.compile("([^%]*)(%)([^%]*)");
/**
 * Private no-argument constructor; it hides the default constructor of this utility class.
 */
private Formatter() {
    // intentionally empty
}
/**
 * Normalises a sentence by replacing characters with equivalent characters
 * and applying a few minor rearrangements.
 *
 * Steps, in order: hyphen-like characters become '-', Greek mu (U+03BC) becomes the
 * micro sign (U+00B5), MATCH_SULPH matches become "sulf", MATCH_DEGREES_WHITESPACE
 * matches are rewritten with the replacement "$2$1$3", WHITESPACE_PATTERN matches become
 * a single space, and finally the space between an element name and a following
 * charge/oxidation state specifier is removed.
 *
 * @param sentence (String)
 * @return newSentence (String)
 */
public static String normaliseText(String sentence){
    // Every dash variant below is mapped onto the plain ASCII hyphen-minus.
    final char[] hyphenVariants = {'\u2010', '\u2011', '\u2012', '\u2013', '\u2014', '\u2015', '\u002d', '\u2212'};
    String normalised = sentence;
    for (char variant : hyphenVariants) {
        normalised = normalised.replace(variant, '-');
    }
    // normalise mu to micro
    normalised = normalised.replace('\u03BC', '\u00B5');
    // correct British spelling to the IUPAC spelling to assist OSCAR
    normalised = MATCH_SULPH.matcher(normalised).replaceAll("sulf");
    // correct [degree symbol, space, temperature unit] to [space, degree symbol, temperature unit]
    normalised = MATCH_DEGREES_WHITESPACE.matcher(normalised).replaceAll("$2$1$3");
    normalised = WHITESPACE_PATTERN.matcher(normalised).replaceAll(" ");
    return removeSpaceBetweenElementsAndChargeOrOxidationStateSpecifier(normalised);
}
/**
 * Joins a charge/oxidation state specifier onto the element name in front of it,
 * e.g. palladium (II) --> palladium(II).
 *
 * Each match of matchSpaceChargeOrOxidationSpecifier is rewritten only when the text before
 * it ends (case-insensitively) with one of the names in {@code elements}; the rewrite drops
 * the first character of the match (presumably the space, going by the pattern's name).
 * All other text is copied through unchanged.
 *
 * @param sentence the text to process
 * @return the text with the qualifying matches shortened by their first character
 */
private static String removeSpaceBetweenElementsAndChargeOrOxidationStateSpecifier(String sentence) {
    Matcher specifierMatcher = matchSpaceChargeOrOxidationSpecifier.matcher(sentence);
    StringBuffer rebuilt = new StringBuffer();
    while (specifierMatcher.find()) {
        String precedingText = sentence.substring(0, specifierMatcher.start());
        boolean precededByElement = false;
        for (String elementName : elements) {
            if (StringTools.endsWithCaseInsensitive(precedingText, elementName)) {
                precededByElement = true;
                break;
            }
        }
        if (precededByElement) {
            // Skipped matches are copied verbatim by the next appendReplacement/appendTail call.
            specifierMatcher.appendReplacement(rebuilt, specifierMatcher.group().substring(1));
        }
    }
    specifierMatcher.appendTail(rebuilt);
    return rebuilt.toString();
}
/**
 * Returns the list of tokens with tokens divided into further tokens in cases where this improves tagging
 * e.g. {@code ['4.5kg'] --> ['4.5', 'kg'] }
 *
 * The list passed in is modified in place and is also the list that is returned. A token that
 * gets split is replaced by its parts, and the first part is examined again on the next
 * iteration, so parts can themselves be split further. Offsets of the new tokens are derived
 * from the start offset of the token they replace. After all splitting every token is given a
 * fresh consecutive index starting at 0.
 *
 * @param tokens (List) the tokens to refine; must support remove/addAll
 * @return tokens (List) the same list instance
 */
public static List<Token> subTokeniseTokens(List<Token> tokens){
    int i = 0;
    while (i < tokens.size()) {
        Token token = tokens.get(i);
        String[] subTokens = subTokenize(token.getSurface());
        if (subTokens != null) {
            int start = token.getStart();
            List<Token> newTokens = new ArrayList<Token>(subTokens.length);
            for (String newTokenSurface : subTokens) {
                int end = start + newTokenSurface.length();
                newTokens.add(new Token(newTokenSurface, start, end, token.getDoc(), token.getBioType(), token.getNeElem()));
                start = end;
            }
            tokens.remove(i);
            tokens.addAll(i, newTokens);
            // i is deliberately not advanced: the first replacement token is re-checked.
        }
        else {
            i++;
        }
    }
    int id = 0;
    for (Token token : tokens) {
        token.setIndex(id++);
    }
    return tokens;
}
/**
 * Works out how a single token surface should be split, if at all.
 * The rules are tried in this order and the first that applies decides the result:
 * value+unit (4.5g), pH+value (pH7), value+temperature unit (50oC), anything containing
 * a percent sign, and finally a word wrapped in round brackets.
 *
 * @param tokenSurface the text of the token
 * @return the pieces in order, or null when the surface is at most one character long or no rule applies
 */
private static String[] subTokenize(String tokenSurface) {
    final int length = tokenSurface.length();
    if (length <= 1) {
        return null;
    }
    // split values from units e.g. 4.5g --> 4.5 g
    if (CONCAT_AMOUNT_PATTERN.matcher(tokenSurface).matches()) {
        return splitAmounts(tokenSurface);
    }
    // e.g. pH7 --> pH 7
    if (CONCAT_PH_PATTERN.matcher(tokenSurface).find()) {
        String prefix = tokenSurface.substring(0, 2);
        String value = tokenSurface.substring(2);
        return new String[]{prefix, value};
    }
    // e.g. 50oC --> 50 oC
    Matcher temperature = CONCAT_TEMP_PATTERN.matcher(tokenSurface);
    if (temperature.find()) {
        int degreeIndex = temperature.start(1);
        String value = tokenSurface.substring(0, degreeIndex);
        String unit = tokenSurface.substring(degreeIndex);
        return new String[]{value, unit};
    }
    if (tokenSurface.contains("%")) {
        return splitPercentageSign(tokenSurface);
    }
    // splits brackets off a word enclosed by brackets. Needed to fix OSCAR not tokenising oxidation states
    boolean bracketed = length > 2 && tokenSurface.startsWith("(") && tokenSurface.endsWith(")");
    if (bracketed) {
        int lastIndex = length - 1;
        String opening = tokenSurface.substring(0, 1);
        String inner = tokenSurface.substring(1, lastIndex);
        String closing = tokenSurface.substring(lastIndex);
        return new String[]{opening, inner, closing};
    }
    return null;
}
/**
 * Splits an amount string into its value and its unit at the first letter.
 * Everything before the first character for which Character.isLetter is true is the value,
 * the rest is the unit. Without any letter the unit is the empty string.
 *
 * @param amountString (String)
 * @return valueUnitArray (String[]) two elements: value, then unit
 */
private static String[] splitAmounts(String amountString) {
    final int length = amountString.length();
    int firstLetter = 0;
    while (firstLetter < length && !Character.isLetter(amountString.charAt(firstLetter))) {
        firstLetter++;
    }
    String value = amountString.substring(0, firstLetter);
    String unit = amountString.substring(firstLetter);
    return new String[]{value, unit};
}
/**
 * Splits a token around its percentage signs. Each percent sign becomes its own piece; non-empty
 * text before it is a piece of its own, and text after it is either one piece or, when it starts
 * with a hyphen and is longer than one character, the hyphen followed by the remainder.
 * e.g. {@code "5%-10" --> ["5", "%", "-", "10"]}
 *
 * @param tokenSuface (String) the token text; expected to contain at least one percent sign
 * @return the pieces in their original order (String[])
 */
private static String[] splitPercentageSign(String tokenSuface) {
    // A typed list is required here: toArray(T[]) on a raw List only yields Object[].
    List<String> subTokens = new ArrayList<String>();
    Matcher concatPercentageMatcher = CONCAT_PERCENTAGE_PATTERN.matcher(tokenSuface);
    while (concatPercentageMatcher.find()) {
        String beforePercentageSign = concatPercentageMatcher.group(1);
        if (beforePercentageSign.length() > 0) {
            subTokens.add(beforePercentageSign);
        }
        subTokens.add(concatPercentageMatcher.group(2));
        String afterPercentageSign = concatPercentageMatcher.group(3);
        if (afterPercentageSign.length() > 0) {
            if (afterPercentageSign.startsWith("-") && afterPercentageSign.length() > 1) {
                subTokens.add("-");
                subTokens.add(afterPercentageSign.substring(1));
            }
            else {
                subTokens.add(afterPercentageSign);
            }
        }
    }
    return subTokens.toArray(new String[subTokens.size()]);
}
}
OpenNLPTagger.java 0000664 0000000 0000000 00000007722 14163277713 0033306 0 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTagger;
import opennlp.tools.postag.POSTaggerME;
import uk.ac.cam.ch.wwmm.oscar.document.Token;
/*****************************************************
 * Runs the OpenNLP tagger .
 *
 * The POS model is loaded once, when the singleton is first requested, from the classpath
 * resource "openNLPTagger/en-pos-maxent.bin" (resolved relative to this class). A new
 * POSTaggerME is created from that model for every tagging call.
 *
 * @author lh359, dmj30,jat45,dl387
 *****************************************************/
public class OpenNLPTagger implements Tagger{

    /** Classpath location of the serialised POS model, relative to this class. */
    private static final String MODEL_RESOURCE = "openNLPTagger/en-pos-maxent.bin";

    /**************************************
     * Private Singleton holder.
     ***************************************/
    private static class INSTANCE_HOLDER {
        private static final OpenNLPTagger myInstance = new OpenNLPTagger();
    }

    private final POSModel posModel;

    /**************************************
     * Private Constructor Class.
     ***************************************/
    private OpenNLPTagger() {
        posModel = loadModel();
    }

    /**************************************
     * Reads the POS model from the classpath and always closes the stream afterwards.
     *
     * @return the loaded model
     * @throws IllegalStateException if the model resource cannot be found
     * @throws RuntimeException wrapping any failure while reading or closing the stream
     ***************************************/
    private static POSModel loadModel() {
        InputStream is = OpenNLPTagger.class.getResourceAsStream(MODEL_RESOURCE);
        if (is == null) {
            throw new IllegalStateException("POS model resource not found: " + MODEL_RESOURCE);
        }
        try {
            try {
                return new POSModel(is);
            } finally {
                is.close();
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**************************************
     * Returns an OpenNLPTagger singleton.
     * @return OpenNLPTaggerInstance.
     ***************************************/
    public static OpenNLPTagger getInstance() {
        return INSTANCE_HOLDER.myInstance;
    }

    /**************************************
     * Getter method for posTagger. A new POSTaggerME is created on every call.
     *
     * @return posTagger(PosTagger).
     ***************************************/
    public POSTagger getTagger() {
        return new POSTaggerME(posModel);
    }

    /*****************************************************
     * Runs the OpenNLP POS tagger against a list of tokens and returns a list of tags
     * @param tokenList (List) tokens whose surfaces are tagged
     * @param inputSentence (String) not used by this tagger
     * @return tagList (List) one tag per token, after the renaming done by
     *         {@link #createPosTagListFromStringArray(String[])}
     *****************************************************/
    public List<String> runTagger(List<Token> tokenList, String inputSentence) {
        int tokenCount = tokenList.size();
        String[] tokens = new String[tokenCount];
        for (int i = 0; i < tokenCount; i++) {
            tokens[i] = tokenList.get(i).getSurface();
        }
        return createPosTagListFromStringArray(tag(tokens));
    }

    /**************************************
     * Tags the given token strings with a freshly created POSTaggerME.
     *
     * @param sentenceTokens (String[]) the token strings of one sentence
     * @return the tags returned by POSTaggerME.tag for those tokens
     ***************************************/
    public String[] tag(String[] sentenceTokens){
        POSTaggerME posTagger = new POSTaggerME(posModel);
        return posTagger.tag(sentenceTokens);
    }

    /**************************************
     * Creates the posTagList from the openNLP string Array format.
     * Empty or null tags and "#" become "NN", "." becomes "STOP", "," becomes "COMMA",
     * ":" becomes "COLON", "``" and "$" become "FW"; every other tag is kept as it is.
     *
     * @param posTags (String[])
     * @return the converted tags, in the same order
     ***************************************/
    public List<String> createPosTagListFromStringArray(String[] posTags) {
        List<String> posTagList = new ArrayList<String>(posTags.length);
        for (String posTag : posTags) {
            if (StringUtils.isEmpty(posTag)) {
                posTagList.add("NN");
            } else if (posTag.equals(".")) {
                posTagList.add("STOP");
            } else if (posTag.equals(",")) {
                posTagList.add("COMMA");
            } else if (posTag.equals(":")) {
                posTagList.add("COLON");
            } else if (posTag.equals("#")) {
                posTagList.add("NN");
            } else if (posTag.equals("``")) {
                posTagList.add("FW");
            } else if (posTag.equals("$")) {//used for currency symbols
                posTagList.add("FW");
            } else {
                posTagList.add(posTag);
            }
        }
        return posTagList;
    }

    /**************************************
     * This tagger declares no ignored tags.
     *
     * @return always null
     ***************************************/
    public List<String> getIgnoredTags() {
        return null;
    }
}
chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/OscarTagger.java 0000664 0000000 0000000 00000006346 14163277713 0033162 0 ustar 00root root 0000000 0000000 /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import uk.ac.cam.ch.wwmm.oscar.Oscar;
import uk.ac.cam.ch.wwmm.oscar.document.NamedEntity;
import uk.ac.cam.ch.wwmm.oscar.document.Token;
import uk.ac.cam.ch.wwmm.oscar.document.TokenSequence;
import uk.ac.cam.ch.wwmm.oscarMEMM.MEMMRecogniser;
/*****************************************************
 * Runs the OSCAR tagger .
 *
 * @author lh359, dmj30
 *****************************************************/
public class OscarTagger implements Tagger {

    protected Oscar oscar;

    /*****************************
     * Default constructor method.
     * Installs a new MEMMRecogniser on the given Oscar instance after calling
     * setDeprioritiseOnts(true), setCprPseudoConfidence(0) and setOntPseudoConfidence(0) on it,
     * so the passed-in instance is reconfigured.
     * @param oscar (Oscar)
     ***************************/
    public OscarTagger(Oscar oscar) {
        this.oscar = oscar;
        MEMMRecogniser recogniser = new MEMMRecogniser();
        recogniser.setDeprioritiseOnts(true);
        recogniser.setCprPseudoConfidence(0);
        recogniser.setOntPseudoConfidence(0);
        oscar.setRecogniser(recogniser);
    }

    /***********************************************
     * Runs OSCAR over a list of tokens and returns a list of tags
     * Every position starts as "nil". Tokens that belong to a recognised named entity get
     * "OSCAR-" followed by the entity type name, except for entities whose lower-cased type
     * name is "cpr" or "ont", which are ignored.
     * @param tokenList (List)
     * @param inputSentence (String) the text the tokens were taken from
     * @return tagList (List) one tag per token
     ***********************************************/
    public List<String> runTagger(List<Token> tokenList, String inputSentence) {
        List<TokenSequence> tokenSequences = Arrays.asList(generateOscarTokenSequence(tokenList, inputSentence));
        List<NamedEntity> neList = oscar.recogniseNamedEntities(tokenSequences);
        List<String> ignoreOscarList = Arrays.asList("cpr", "ont");
        String nilTag = "nil";
        List<String> tagList = new ArrayList<String>(tokenList.size());
        for (int i = 0; i < tokenList.size(); i++) {
            tagList.add(nilTag);
        }
        for (NamedEntity ne : neList) {
            String typeName = ne.getType().getName();
            if (!ignoreOscarList.contains(typeName.toLowerCase())) {
                for (Token token : ne.getTokens()) {
                    // Token indexes were assigned in generateOscarTokenSequence and match list positions.
                    tagList.set(token.getIndex(), "OSCAR-" + typeName);
                }
            }
        }
        return tagList;
    }

    /**
     * This tagger declares no ignored tags.
     * @return always null
     */
    public List<String> getIgnoredTags() {
        return null;
    }

    /*********************************************
     * Generates an OSCAR TokenSequence from a list of tokens
     * Each token of the sequence is linked back to the sequence and given a
     * consecutive index starting at 0.
     * @param oscarTokens (List)
     * @param inputText (String)
     * @return tokenSequence (TokenSequence)
     ********************************************/
    protected TokenSequence generateOscarTokenSequence(List<Token> oscarTokens, String inputText) {
        TokenSequence tokSeq = new TokenSequence(inputText, 0, null, oscarTokens);
        int id = 0;
        for (Token token : tokSeq.getTokens()) {
            token.setTokenSequence(tokSeq);
            token.setIndex(id++);
        }
        return tokSeq;
    }
}
OscarTokeniser.java 0000664 0000000 0000000 00000002636 14163277713 0033633 0 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger;
import java.util.List;
import uk.ac.cam.ch.wwmm.oscar.document.Token;
import uk.ac.cam.ch.wwmm.oscartokeniser.Tokeniser;
/**
 * ChemicalTaggerTokeniser implementation that delegates to the default OSCAR4 Tokeniser instance.
 */
public class OscarTokeniser implements ChemicalTaggerTokeniser {

    private final Tokeniser oscarTokeniser;

    /*********************
     * Public Constructor method.
     * Uses the instance returned by Tokeniser.getDefaultInstance().
     */
    public OscarTokeniser() {
        this.oscarTokeniser = Tokeniser.getDefaultInstance();
    }

    /*****************************************************
     * Tokenises input text using the OSCAR4 tokeniser.
     * Returns a list of tokens
     * @param inputSentence (String)
     * @return List the tokens of the token sequence produced for the input
     *****************************************************/
    public List<Token> tokenise(String inputSentence) {
        return oscarTokeniser.tokenise(inputSentence).getTokens();
    }
}
chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/POSContainer.java0000664 0000000 0000000 00000013327 14163277713 0033262 0 ustar 00root root 0000000 0000000 /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger;
import java.util.ArrayList;
import java.util.List;
import nu.xom.Element;
import org.apache.commons.lang.StringUtils;
import uk.ac.cam.ch.wwmm.oscar.document.Token;
/********************************************
* A container class that stores the grammatical structure of the text .
*
* @author lh359
********************************************/
public class POSContainer {
private static String SPACE = " ";
private String inputText;
private boolean prioritiseOscar = true;
private List wordTokenList = new ArrayList();
private List combinedTagsList = new ArrayList();
private List> tagListContainer = new ArrayList>();
public List> getTagListContainer() {
return tagListContainer;
}
private Element spectrumElementList;
/******************************
* Default constructor method.
******************************/
public POSContainer() {
}
/**************************************
* Setter method for InputText.
*
* @param inputText (String)
***************************************/
public void setInputText(String inputText) {
this.inputText = inputText;
}
/**************************************
* Getter method for InputText.
*
* @return inputText (String)
**************************************/
public String getInputText() {
return inputText;
}
/**************************************
* Setter method for prioritiseOscar.
*
* @param prioritiseOscar (boolean)
***************************************/
public void setPrioritiseOscar(boolean prioritiseOscar) {
this.prioritiseOscar = prioritiseOscar;
}
/**************************************
* Getter method for prioritiseOscar.
*
* @return prioritiseOscart (boolean)
***************************************/
public boolean getPrioritiseOscar() {
return prioritiseOscar;
}
/**************************************
* Setter method for WordTokenList.
*
* @param wordTokenList (List)
***************************************/
public void setWordTokenList(List wordTokenList) {
this.wordTokenList = wordTokenList;
}
/**************************************
* Getter method for WordTokenList.
*
* @return wordTokenList (List)
***************************************/
public List getWordTokenList() {
return wordTokenList;
}
/**************************************
* Setter method for SpectrumList.
*
* @param spectrumList (List)
***************************************/
public void setSpectrumList(List spectrumList) {
spectrumElementList = new Element("SpectrumList");
for (Element element : spectrumList) {
spectrumElementList.appendChild(element);
}
}
/**************************************
* Getter method for SpectrumList.
*
* @return spectrumList (List)
***************************************/
public Element getSpectrumElementList() {
return spectrumElementList;
}
/**************************************
* Setter method for CombinedTagsList.
*
* @param combinedTagsList (List)
***************************************/
public void setCombinedTagsList(List combinedTagsList) {
this.combinedTagsList = combinedTagsList;
}
/**************************************
* Getter method for CombinedTagsList.
*
* @return combinedTagsList (List)
***************************************/
public List getCombinedTagsList() {
return combinedTagsList;
}
/**************************************
* Adds token to wordTokenList.
*
* @param token (Token)
***************************************/
public void addToTokenList(Token token) {
wordTokenList.add(token);
}
/***************************************
* Combines the output of all the taggers.
***************************************/
public void combineTaggers() {
int size = tagListContainer.get(0).size();
for (int i = 0; i < size; i++) {
for (List tagList : tagListContainer) {
if (tagList.get(i).equals("nil")) {
continue;
}
else {
combinedTagsList.add(tagList.get(i));
break;
}
}
}
}
/*********************************************
* Gets the output of the string and tag as: "TAG TOKEN".
*
* @return tokenTagTupleString (String).
*********************************************/
public String getTokenTagTupleAsString() {
StringBuilder tokenTagTupleString = new StringBuilder();
for (int i = 0; i < wordTokenList.size(); i++) {
if (StringUtils.isNotEmpty(combinedTagsList.get(i))
&& StringUtils.isNotEmpty(wordTokenList.get(i).getSurface())) {
tokenTagTupleString.append(combinedTagsList.get(i));
tokenTagTupleString.append(SPACE);
tokenTagTupleString.append(wordTokenList.get(i).getSurface());
tokenTagTupleString.append(SPACE);
}
}
return tokenTagTupleString.toString().trim();
}
/********************************************
* Adds tagLists to the tagListContainer
* @param tagList
*/
public void registerTagList(List tagList) {
tagListContainer.add(tagList);
}
}
PostProcessTags.java 0000664 0000000 0000000 00000055501 14163277713 0034002 0 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import uk.ac.cam.ch.wwmm.oscar.document.Token;
/**************************************************
* Corrects Mistagged tokens.
*
* @author lh359, dl387
**************************************************/
public class PostProcessTags {
// Strings loaded via Utils.loadsTagsFromFile; tokens whose surface is in this set get lower-cased.
private static HashSet<String> tagSet = Utils.loadsTagsFromFile(PostProcessTags.class);
// Container whose combined tag list is replaced by the corrected one.
private final POSContainer posContainer;
// Tokens of the container, captured at construction time.
private final List<Token> tokenList;
// Combined tags of the container as they were at construction time; one tag per token.
private final List<String> combinedTags;
/**
 * Creates a corrector for the mistagged tokens within the given POSContainer.
 * The container's token list and combined tag list are read once, here.
 * @param posContainer the container to correct
 */
public PostProcessTags(POSContainer posContainer) {
    this.posContainer = posContainer;
    this.tokenList = posContainer.getWordTokenList();
    this.combinedTags = posContainer.getCombinedTagsList();
}
/********************************************
 * Corrects known tagging problems in the tags in combinedtagsList.
 * Equivalent to calling the one-argument overload with an empty list of ignored tags.
 *******************************************/
public void correctCombinedTagsList() {
    correctCombinedTagsList(new ArrayList<String>());
}
/***********************************************
 * Corrects known tagging problems in the tags in combinedtagsList.
 * Tags in the ignoredTags list will never be created
 * (used when called from other applications)
 *
 * For each token the noun, verb, digit, unit and misc corrections are tried in that order;
 * a later correction is only tried while the tag is still unchanged. If the resulting tag is
 * in ignoredTags the original tag is kept. Tokens whose surface is contained in tagSet have
 * their surface lower-cased. The corrected tags are stored on the POSContainer as a new list;
 * the tag list captured by the constructor is not modified.
 *
 * @param ignoredTags (List) tags that must not be produced; null is treated as empty
 **********************************************/
public void correctCombinedTagsList(List<String> ignoredTags) {
    List<String> newCombinedTagsList = new ArrayList<String>(combinedTags.size());
    for (int i = 0; i < combinedTags.size(); i++) {
        String currentTag = combinedTags.get(i);
        Token currentToken = tokenList.get(i);
        String currentTokenStr = currentToken.getSurface();

        String newTag = correctMisTaggedNouns(i, currentTag, currentTokenStr);
        if (newTag.equals(currentTag)){
            newTag = correctMisTaggedVerbs(i, currentTag, currentTokenStr);
        }
        if (newTag.equals(currentTag)){
            newTag = correctMisTaggedDigits(i, currentTag, currentTokenStr);
        }
        if (newTag.equals(currentTag)){
            newTag = correctMisTaggedUnits(i, currentTag, currentTokenStr);
        }
        if (newTag.equals(currentTag)){
            newTag = correctMisTaggedMisc(i, currentTag, currentTokenStr);
        }

        if (tagSet.contains(currentTokenStr)) {
            currentToken.setSurface(currentTokenStr.toLowerCase());
        }

        boolean tagIsIgnored = ignoredTags != null && ignoredTags.contains(newTag);
        newCombinedTagsList.add(tagIsIgnored ? currentTag : newTag);
    }
    posContainer.setCombinedTagsList(newCombinedTagsList);
}
/******************************************
 * Corrects the mistagged nouns.
 * Checks, in order: an nn-mixture tag not followed by in-of, the words "acid"/"base" tagged NN*,
 * the word "formula" followed by a number or identifier tag, and a colour word followed by nn-state.
 * @param i index of the token within combinedTags
 * @param currentTag the tag currently assigned to the token
 * @param currentTokenStr the surface of the token
 * @return the corrected tag, or currentTag when no rule applies
 */
private String correctMisTaggedNouns(int i, String currentTag, String currentTokenStr) {
    String lowerCaseTag = currentTag.toLowerCase();

    if (lowerCaseTag.startsWith("nn-mixture")
            && !stringAfter(Arrays.asList("in-of"), i, combinedTags)) {
        return "NN-CHEMENTITY";
    }

    boolean acidOrBase = currentTokenStr.equalsIgnoreCase("acid") || currentTokenStr.equalsIgnoreCase("base");
    if (acidOrBase && currentTag.startsWith("NN")) {
        return "NN-CHEMENTITY";
    }

    if (currentTokenStr.equalsIgnoreCase("formula")
            && stringAfter(Arrays.asList("cd", "cd-alphanum", "nn-identifier"), i, combinedTags)) {
        return "NN-CHEMENTITY";
    }

    List<String> colourWords = Arrays.asList("amber", "bronze", "cream", "fawn", "gold", "ivory", "lavender", "tan");
    if (colourWords.contains(currentTokenStr.toLowerCase())
            && stringAfter(Arrays.asList("nn-state"), i, combinedTags)) {
        return "JJ";
    }

    return currentTag;
}
/*************************************
 * Corrects the mistagged verbs.
 * The rules are evaluated top to bottom and the first one that fires decides the tag,
 * so their order matters.
 *
 * @param i (Integer) index of the token within combinedTags
 * @param currentTag (String) the tag currently assigned to the token
 * @param currentTokenStr (String) the surface of the token
 * @return the corrected tag, or currentTag when no rule applies
 *************************************/
private String correctMisTaggedVerbs(int i, String currentTag, String currentTokenStr) {
    String currentTagLC = currentTag.toLowerCase();
    // i is always a valid index, so the last token is at size() - 1 (comparing with size() can never match).
    boolean isLastToken = i == combinedTags.size() - 1;

    if (currentTokenStr.equalsIgnoreCase("yield") ) {
        //Disambiguates between yield as a verb and the yield of a product compound
        List<String> beforeList = Arrays.asList("nn-percent");//e.g. 30% yield
        List<String> afterList = Arrays.asList("in-of", "colon");//e.g. yield of 30% /yield :30%
        if (stringBefore(beforeList, i, combinedTags) || ( stringAfter(afterList, i, combinedTags))) {
            return "NN-YIELD";
        }
        afterList = Arrays.asList("nn-chementity");
        beforeList = Arrays.asList("dt", "dt-the");
        if (stringAfter(afterList, i, combinedTags) && stringBefore(beforeList, i, combinedTags)){
            return "JJ-COMPOUND";
        }
    }

    if (currentTagLC.startsWith("vb") && currentTokenStr.equalsIgnoreCase("form")) {//"form" is only a VB-YIELD if it is a verb
        return "VB-YIELD";
    }

    if (currentTagLC.startsWith("vb-filter")) {
        List<String> beforeList = Arrays.asList("oscar-cj", "jj-chem");
        if (stringBefore(beforeList, i, combinedTags)) {
            return "NN";
        }
    }

    if (endsWithCaseInsensitive(currentTokenStr, "dilute")//correct cases where dilute and the like are actually adjectives e.g. dilute sulfuric acid
            || endsWithCaseInsensitive(currentTokenStr, "diluted")
            || endsWithCaseInsensitive(currentTokenStr, "concentrated")
            || endsWithCaseInsensitive(currentTokenStr, "dry")
            || endsWithCaseInsensitive(currentTokenStr, "dried")){
        List<String> afterList = Arrays.asList("oscar-cm", "nn-chementity", "oscar-cj", "jj-chem");
        if (stringAfter(afterList, i, combinedTags)) {
            return "JJ-CHEM";
        }
    }

    if (currentTagLC.startsWith("vb-") || //TODO what does this actually do other than correcting reaction mixture?
            (currentTagLC.startsWith("nn") && !currentTagLC.startsWith("nn-state")
            && !currentTagLC.startsWith("nn-apparatus")
            && !currentTagLC.startsWith("nn-cycle")
            && !currentTagLC.startsWith("nn-temp")
            && !currentTagLC.startsWith("nn-pressure")
            && !currentTagLC.startsWith("nn-time")
            && !currentTagLC.startsWith("nn-molar")
            && !currentTagLC.startsWith("nn-vacuum")
            && !currentTagLC.startsWith("nnp"))) {
        List<String> beforeList = Arrays.asList("dt", "jj", "jj-chem", "dt-the");
        List<String> afterListJJ = Arrays.asList("jj", "nn-chementity", "nn-mixture", "nn-apparatus", "nn", "jj-chem");
        List<String> afterListNN = Arrays.asList("stop", "comma");
        // A determiner/adjective before and the end of the text (or stop/comma) after: treat as a plain noun.
        if (stringBefore(beforeList, i, combinedTags)
                && (isLastToken || stringAfter(afterListNN, i, combinedTags))
                && !currentTagLC.startsWith("nn-")) {
            return "NN";
        }
        else if (stringBefore(beforeList, i, combinedTags)
                && (stringAfter(afterListJJ, i, combinedTags) &&
                !currentTagLC.startsWith("nn-chementity")) && adjObjectExists(i)) {
            return "JJ-CHEM";
        }
    }

    if (currentTagLC.startsWith("vb-")
            || currentTagLC.startsWith("nn-synthesize")) {
        List<String> beforeList = Arrays.asList("dt-the", "dt");
        List<String> afterList = Arrays.asList("vb");
        if (stringBefore(beforeList, i, combinedTags)
                && (stringAfterStartsWith(afterList, i, combinedTags))) {
            return "NN-CHEMENTITY";
        }
    }

    if (currentTagLC.startsWith("vb")
            && Utils.containsNumber(currentTokenStr)) {//verbs are highly unlikely to contain numbers
        return "NN";
    }

    if (currentTagLC.startsWith("vbn")
            || currentTagLC.startsWith("vbg")
            || currentTagLC.startsWith("vb-")){
        List<String> afterList = Arrays.asList("oscar-cm", "nns", "nn-chementity", "oscar-cj", "jj-chem", "nnp");
        List<String> beforeList = Arrays.asList("dt", "rb", "rb-conj", "dt-the", "stop", "in-with", "in-of", "in-under");
        if (stringAfter(afterList, i, combinedTags)
                && stringBefore(beforeList, i, combinedTags)) {
            return "JJ-CHEM";
        }
    }

    if (currentTagLC.startsWith("vb")) {
        List<String> beforeList = Arrays.asList("to");
        List<String> beforebeforeList = Arrays.asList("vb-heat");
        List<String> afterList = Arrays.asList("stop");
        if (stringBefore(beforeList, i, combinedTags)
                && stringBefore(beforebeforeList, i - 1, combinedTags)
                && stringAfter(afterList, i, combinedTags)) {
            return "NN";
        }
    }

    /********
     * Gerunds
     */
    if (currentTagLC.startsWith("vb")
            && (currentTokenStr.toLowerCase().endsWith("ing") || currentTokenStr
                    .toLowerCase().endsWith("ed"))) {
        List<String> afterList = Arrays.asList("nn", "oscar-cm", "nns", "nn-chementity", "oscar-cj", "jj-chem", "jj", "nnp", "nn-state", "nn-apparatus");
        List<String> beforeList = Arrays.asList("dt", "dt-the", "cd", "oscar-cm");
        List<String> notList = Arrays.asList("in-of");
        if(stringAfter(Arrays.asList("oscar-cm"), i, combinedTags)
                && stringBefore(Arrays.asList("oscar-cm", "nn-chementity"), i, combinedTags)){
            //special case to avoid a few known mistags. This function should probably be reduced in scope
        }
        else if (stringAfter(afterList, i, combinedTags) && stringBefore(beforeList, i, combinedTags)) {
            return "JJ";
        }
        else if (currentTokenStr.toLowerCase().endsWith("ing") && stringBefore(beforeList, i, combinedTags) && !stringAfter(notList, i, combinedTags)) {
            return "JJ-CHEM";
        }
    }

    if (currentTagLC.startsWith("vb") && !currentTokenStr.toLowerCase().endsWith("ing")) {
        List<String> beforeList = Arrays.asList("dt", "dt-the", "in-in", "in-of", "rb");
        List<String> afterList = Arrays.asList("nn", "oscar-cm", "nns", "nn-chementity", "oscar-cj", "jj-chem", "jj", "nnp", "nn-state", "nn-apparatus");
        List<String> chemafterList = Arrays.asList("oscar-cm", "nn-chementity", "oscar-cj", "jj-chem");
        if (i != 0) {
            if (!tokenList.get(i - 1).getSurface().equals("that")) {
                if (stringAfter(chemafterList, i, combinedTags)
                        && stringBefore(beforeList, i, combinedTags)) {
                    return "JJ-CHEM";
                }
                else if (stringBefore(beforeList, i, combinedTags)
                        && stringAfter(afterList, i, combinedTags)) {
                    return "JJ";
                } else if (stringBefore(beforeList, i, combinedTags)
                        && (i +1) < combinedTags.size() && combinedTags.get(i + 1).toLowerCase()
                                .startsWith("nn")) {
                    return "JJ";
                }
            }
        }
    }
    return currentTag;
}
/******************************************
 * Correct one character letters that are intended to be units:
 * a single lower-case character that is followed by a "sym" tag is retagged as NN.
 * @param i index of the token within combinedTags
 * @param currentTag the tag currently assigned to the token
 * @param currentTokenStr the surface of the token
 * @return "NN" when the rule applies, otherwise currentTag
 */
private String correctMisTaggedUnits(int i, String currentTag, String currentTokenStr) {
    if (currentTokenStr.length() != 1) {
        return currentTag;
    }
    if (!Character.isLowerCase(currentTokenStr.charAt(0))) {
        return currentTag;
    }
    List<String> symbolTags = Arrays.asList("sym");
    if (stringAfter(symbolTags, i, combinedTags)) {
        return "NN";
    }
    return currentTag;
}
/*************************************
 * Corrects tagging of words intended to be CDs or CD-ALPHANUMs.
 *
 * Rule 1: an "nn-" tagged token containing a digit becomes CD-ALPHANUM when
 * it follows in-of/jj/nn-chementity/comma and is either followed by an
 * opening bracket, stop or comma, or is the last token of the sentence.
 * Rule 2: a CD-ALPHANUM becomes a plain CD when it is followed by a
 * volume/mass unit, contains a '.', or is longer than four characters.
 *
 * @param i (Integer) index of the current token
 * @param currentTag (String) tag currently assigned to the token
 * @param currentTokenStr (String) surface text of the token
 * @return the corrected tag, or currentTag when neither rule applies
 *************************************/
private String correctMisTaggedDigits(int i, String currentTag, String currentTokenStr) {
    String currentTagLC = currentTag.toLowerCase();
    if (currentTagLC.startsWith("nn-") && Utils.containsNumber(currentTokenStr)) {
        List<String> beforeList = Arrays.asList("in-of", "jj", "nn-chementity", "comma");
        List<String> afterList = Arrays.asList("-lrb-", "stop", "comma");
        // i indexes into combinedTags, so the last token is at size() - 1.
        // (The previous comparison against size() could never be true.)
        boolean isLastToken = i == combinedTags.size() - 1;
        if (stringBefore(beforeList, i, combinedTags)
                && (stringAfter(afterList, i, combinedTags) || isLastToken)) {
            return "CD-ALPHANUM";
        }
    }
    if (currentTagLC.equals("cd-alphanum")) {
        List<String> afterList = Arrays.asList("nn-vol", "nn-mass");
        if (stringAfter(afterList, i, combinedTags)
                || currentTokenStr.contains(".")
                || currentTokenStr.length() > 4) {
            return "CD";
        }
    }
    return currentTag;
}
/*************************************
* Corrects a miscellaneous collection of mistagged tokens (units, proper
* nouns, single-letter identifiers, state symbols, "precipitate", ...).
* The rules are tried in the order written and the first one that matches
* decides the returned tag.
*
* @param i (Integer) index of the current token
* @param currentTag (String) tag currently assigned to the token
* @param currentTokenStr (String) surface text of the token
* @return the corrected tag, or currentTag when no rule matches
*************************************/
private String correctMisTaggedMisc(int i, String currentTag, String currentTokenStr) {
String currentTagLC = currentTag.toLowerCase();
// A proper noun whose text is "M" (compared ignoring case) is the molar unit
if (currentTagLC.equals("nnp")
&& StringUtils.equalsIgnoreCase(currentTokenStr, "M")) {
return "NN-MOLAR";
}
// A capitalised plural noun that does not directly follow a stop is a plural proper noun
if (i != 0 && currentTagLC.equals("nns")) {
List beforeList = Arrays.asList("stop");
if (currentTokenStr.endsWith("s")
&& Character.isUpperCase(currentTokenStr.charAt(0))) {
if (!stringBefore(beforeList, i, combinedTags)){
return "NNPS";
}
}
}
// Single-character "adverbs" are re-tagged as (proper) nouns depending on case
// NOTE(review): charAt(0) would throw for an empty token - presumably tokens are never empty; confirm
if (currentTagLC.equals("rb") && currentTokenStr.length() < 2) {
if (Character.isUpperCase(currentTokenStr.charAt(0)) ){
return "NNP";
}
else {
return "NN";
}
}
// "M" directly after a cardinal number is a molar unit
if (currentTokenStr.equals("M")) {
List beforeList = Arrays.asList("cd");
if (stringBefore(beforeList, i, combinedTags)) {
return "NN-MOLAR";
}
}
// "K" directly after a cardinal number is a temperature unit
if (currentTokenStr.equals("K")) {
List beforeList = Arrays.asList("cd");
if (stringBefore(beforeList, i, combinedTags)) {
return "NN-TEMP";
}
}
// The '' tag is always replaced by FW
if (currentTagLC.equals("''")) {
return "FW";
}
// nn-mixture directly followed by a vbd tag becomes a chemical entity
if (currentTagLC.equals("nn-mixture")) {
List afterList = Arrays.asList("vbd");
if (stringAfter(afterList, i, combinedTags)) {
return "NN-CHEMENTITY";
}
}
// nn-concentrate followed by in-of and not preceded by an adjective tag becomes a plain noun
if (currentTagLC.startsWith("nn-concentrate")) {
List beforeList = Arrays.asList("jj", "oscar-cj", "jj-chem");
List afterList = Arrays.asList("in-of");
if (!stringBefore(beforeList, i, combinedTags)
&& (stringAfter(afterList, i, combinedTags))) {
return "NN";
}
}
// A capitalised nn-add that does not directly follow stop/comma/colon is a proper noun
if (i != 0 && currentTagLC.startsWith("nn-add")) {
List beforeList = Arrays.asList("stop", "comma", "colon");
if (!stringBefore(beforeList, i, combinedTags)&& Character.isUpperCase(currentTokenStr.charAt(0))) {
return "NNP";
}
}
// A capitalised adjective/proper noun one or two tokens before nn-campaign is a proper noun
if (currentTagLC.startsWith("jj")
|| currentTagLC.startsWith("nnp")) {
List afterList = Arrays.asList("nn-campaign");
if ((stringAfter(afterList, i, combinedTags) || string2After(
afterList, i, combinedTags))
&& Character.isUpperCase(currentTokenStr.charAt(0))) {
return "NNP";
}
}
// "addition" between in-in and a comma/stop is a plain noun
if (currentTokenStr.toLowerCase().equals("addition")) {
List beforeList = Arrays.asList("in-in");
List afterList = Arrays.asList("comma", "stop");
if (stringBefore(beforeList, i, combinedTags)
&& (stringAfter(afterList, i, combinedTags))) {
return "NN";
}
}
// "obtain..." followed by in-from and then a proper noun is a plain verb
if (currentTokenStr.toLowerCase().startsWith("obtain")) {
List afterList = Arrays.asList("in-from");
List after2List = Arrays.asList("nnp");
if (stringAfter(afterList, i, combinedTags)
&& (string2After(after2List, i, combinedTags))) {
return "VB";
}
}
// nn-synthesize between a determiner/apparatus/rb-conj tag and an apparatus is a chemical adjective
if (currentTagLC.startsWith("nn-synthesize")) {
List afterList = Arrays.asList("nn-apparatus");
List beforeList = Arrays.asList("dt", "nn-apparatus", "rb-conj", "dt-the");
if (stringAfter(afterList, i, combinedTags)
&& stringBefore(beforeList, i, combinedTags)) {
return "JJ-CHEM";
}
}
// "D" tagged as a time unit but directly preceded by in-in is a plain noun
if (currentTokenStr.equals("D")
&& currentTagLC.equals("nn-time")) {
List beforeList = Arrays.asList("in-in");
if (stringBefore(beforeList, i, combinedTags)) {
return "NN";
}
}
//Identifies a capital letter or single character roman number that is likely to be an identifier
if (currentTokenStr.length()==1 && Character.isLetter(currentTokenStr.charAt(0))){
char charac = currentTokenStr.charAt(0);
List beforeBracket = Arrays.asList("-lrb-");
List afterBracket = Arrays.asList("-rrb-");
// Matches a bracketed letter "(x)" (the opening bracket may be missing at sentence start),
// or a sentence-initial letter directly followed by "." or ":"
if ((stringBefore(beforeBracket, i, combinedTags) || i==0) && stringAfter(afterBracket, i, combinedTags)
|| (i==0 && tokenAfter(Arrays.asList(".", ":"), i))){
//could be an abbreviation: the tag two positions earlier decides which unit/state the letter stands for
if ((charac =='d' || charac =='D' || charac =='h' || charac =='s') && string2Before(Arrays.asList("nn-time"), i, combinedTags)){
return "NN-TIME";
}
else if (charac =='g' && string2Before(Arrays.asList("nn-mass"), i, combinedTags)){
return "NN-MASS";
}
else if (charac =='K' && string2Before(Arrays.asList("nn-temp"), i, combinedTags)){
return "NN-TEMP";
}
else if ((charac =='l' || charac =='L') && string2Before(Arrays.asList("nn-vol"), i, combinedTags)){
return "NN-VOL";
}
else if ((charac =='g' || charac =='l' || charac =='s') && string2Before(Arrays.asList("oscar-cm"), i, combinedTags)){
return "NN-STATE";//state symbol i.e. gas/liquid/solid
}
else{
return "NN-IDENTIFIER";
}
}
// A lone letter after example/method/chementity/"of" is an identifier unless it reads as the English word a/A/I
List beforeList = Arrays.asList("nn-example", "nn-method", "nn-chementity", "in-of" );
if (stringBefore(beforeList, i, combinedTags) && !isEnglishUseOfAorI(charac, i) ){
return "NN-IDENTIFIER";
}
}
// "precipitate" gets a dedicated noun or verb tag depending on its current tag
if (currentTokenStr.equalsIgnoreCase("precipitate")){
if (currentTagLC.startsWith("nn")){
return "NN-CHEMENTITY";
}
else if (currentTagLC.startsWith("vb")){
return "VB-PRECIPITATE";
}
//Precipitate can also, rarely, be an adjective
}
// A non-initial plain noun containing upper-case characters is a proper noun
// NOTE(review): the second clause is true whenever lower-casing changes the token, which
// includes a capitalised first letter, so the "stop" check looks redundant - confirm intent
if (i != 0 && currentTagLC.equals("nn")) {
List beforeList = Arrays.asList("stop");
if ((Character.isUpperCase(currentTokenStr.charAt(0)) && !stringBefore(beforeList, i, combinedTags)) || (!currentTokenStr.toLowerCase().equals(currentTokenStr))){
return "NNP";
}
}
return currentTag;
}
/**
 * Decides whether a lone a/A/I looks like the ordinary English word rather
 * than an identifier.
 * A/I count as English at sentence start or directly after a stop/colon;
 * lower-case a counts as English when it is not first and has no stop/colon
 * on either side.
 * @param charac the single character of the token
 * @param i index of the token
 * @return true if the character appears to be used as an English word
 */
private boolean isEnglishUseOfAorI(char charac, int i) {
    List stopOrColon = Arrays.asList("stop", "colon");
    switch (charac) {
        case 'A':
        case 'I':
            return i == 0 || stringBefore(stopOrColon, i, combinedTags);
        case 'a':
            return i != 0
                    && !stringBefore(stopOrColon, i, combinedTags)
                    && !stringAfter(stopOrColon, i, combinedTags);
        default:
            return false;
    }
}
/******************************************
 * Checks whether an adjective is describing an object.
 * Scans the tags after the given index: returns true if a noun tag ("nn...")
 * is reached before any tag starting with "to" or "in", false if such a tag
 * comes first or the sentence ends without a noun.
 * Tags are lower-cased with Locale.ROOT so that matching does not depend on
 * the default locale (a Turkish locale lower-cases "IN" to a dotless i).
 * @param index index of the adjective
 * @return boolean
 */
private boolean adjObjectExists(int index) {
    for (int i = index + 1; i < combinedTags.size(); i++) {
        // Lower-case once per tag instead of once per comparison
        String tagLC = combinedTags.get(i).toLowerCase(java.util.Locale.ROOT);
        if (tagLC.startsWith("to") || tagLC.startsWith("in")) {
            return false;
        }
        if (tagLC.startsWith("nn")) {
            return true;
        }
    }
    return false;
}
/**********************************
 * Tells whether the surface text of the token following the given index,
 * lower-cased, is contained in afterList. Entries of afterList therefore
 * need to be lower case to match.
 *
 * @param afterList (List) surface strings to look for
 * @param index (Integer) index of the current token
 * @return boolean false when there is no following token
 **********************************/
private boolean tokenAfter(List afterList, int index) {
    int nextIndex = index + 1;
    if (nextIndex >= tokenList.size()) {
        return false;
    }
    String nextSurfaceLC = tokenList.get(nextIndex).getSurface().toLowerCase();
    return afterList.contains(nextSurfaceLC);
}
/***********************************
 * Tells whether the tag directly before the given index, lower-cased, is
 * contained in beforeList.
 * Lower-casing uses Locale.ROOT so that tags containing 'I' (e.g. "IN-OF")
 * still match under locales such as Turkish. An index with no preceding tag
 * inside the list yields false rather than an exception.
 *
 * @param beforeList (List) lower-case tags to look for
 * @param index (Integer) index of the current token
 * @param combinedTags (List) tags of the sentence
 * @return boolean
 ***********************************/
private boolean stringBefore(List<String> beforeList, int index,
        List<String> combinedTags) {
    int beforeIndex = index - 1;
    if (beforeIndex < 0 || beforeIndex >= combinedTags.size()) {
        return false;
    }
    String tagLC = combinedTags.get(beforeIndex).toLowerCase(java.util.Locale.ROOT);
    return beforeList.contains(tagLC);
}
/***********************************
 * Tells whether the tag two positions before the given index, lower-cased,
 * is contained in beforeList.
 * Lower-casing uses Locale.ROOT so matching does not depend on the default
 * locale. An index with no such tag inside the list yields false.
 *
 * @param beforeList (List) lower-case tags to look for
 * @param index (Integer) index of the current token
 * @param combinedTags (List) tags of the sentence
 * @return boolean
 ***********************************/
private boolean string2Before(List<String> beforeList, int index,
        List<String> combinedTags) {
    int before2Index = index - 2;
    if (before2Index < 0 || before2Index >= combinedTags.size()) {
        return false;
    }
    String tagLC = combinedTags.get(before2Index).toLowerCase(java.util.Locale.ROOT);
    return beforeList.contains(tagLC);
}
/**********************************
 * Tells whether the tag directly after the given index, lower-cased, is
 * contained in afterList.
 * Lower-casing uses Locale.ROOT so that tags containing 'I' (e.g. "IN-OF")
 * still match under locales such as Turkish. An index with no following tag
 * inside the list yields false.
 *
 * @param afterList (List) lower-case tags to look for
 * @param index (Integer) index of the current token
 * @param combinedTags (List) tags of the sentence
 * @return boolean
 **********************************/
private boolean stringAfter(List<String> afterList, int index,
        List<String> combinedTags) {
    int afterIndex = index + 1;
    if (afterIndex < 0 || afterIndex >= combinedTags.size()) {
        return false;
    }
    String tagLC = combinedTags.get(afterIndex).toLowerCase(java.util.Locale.ROOT);
    return afterList.contains(tagLC);
}
/**********************************
 * Tells whether the tag directly after the given index, lower-cased, starts
 * with any of the prefixes in afterList.
 * Lower-casing uses Locale.ROOT so matching does not depend on the default
 * locale. An index with no following tag inside the list yields false.
 *
 * @param afterList (List) lower-case tag prefixes to look for
 * @param index (Integer) index of the current token
 * @param combinedTags (List) tags of the sentence
 * @return boolean
 **********************************/
private boolean stringAfterStartsWith(List<String> afterList, int index,
        List<String> combinedTags) {
    int afterIndex = index + 1;
    if (afterIndex < 0 || afterIndex >= combinedTags.size()) {
        return false;
    }
    // Lower-case the tag once rather than once per candidate prefix
    String tagLC = combinedTags.get(afterIndex).toLowerCase(java.util.Locale.ROOT);
    for (String prefix : afterList) {
        if (tagLC.startsWith(prefix)) {
            return true;
        }
    }
    return false;
}
/**********************************
 * Tells whether the tag two positions after the given index, lower-cased,
 * is contained in afterList.
 * Lower-casing uses Locale.ROOT so matching does not depend on the default
 * locale. An index with no such tag inside the list yields false.
 *
 * @param afterList (List) lower-case tags to look for
 * @param index (Integer) index of the current token
 * @param combinedTags (List) tags of the sentence
 * @return boolean
 **********************************/
private boolean string2After(List<String> afterList, int index,
        List<String> combinedTags) {
    int after2Index = index + 2;
    if (after2Index < 0 || after2Index >= combinedTags.size()) {
        return false;
    }
    String tagLC = combinedTags.get(after2Index).toLowerCase(java.util.Locale.ROOT);
    return afterList.contains(tagLC);
}
/**
 * Case-insensitive counterpart of {@link String#endsWith(String)}.
 * @param str the string to inspect
 * @param suffix the ending to look for
 * @return true if str ends with suffix when case is ignored; false if suffix is longer than str
 */
private boolean endsWithCaseInsensitive(String str, String suffix) {
    int suffixLength = suffix.length();
    int tailStart = str.length() - suffixLength;
    // A negative start means the suffix cannot fit
    return tailStart >= 0 && str.regionMatches(true, tailStart, suffix, 0, suffixLength);
}
} PostProcessTrees.java 0000664 0000000 0000000 00000056377 14163277713 0034202 0 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import nu.xom.Attribute;
import nu.xom.Document;
import nu.xom.Element;
import nu.xom.Elements;
import nu.xom.Node;
import nu.xom.Nodes;
/***********************************************
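* Post-processes the sentence trees recognised by the ANTLR grammar: groups
* phrases into ActionPhrase elements, assigns roles (e.g. Solvent, Catalyst)
* to molecules and returns the result as a new XML document.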
*
* @author lh359, dl387
*****************************************/
public class PostProcessTrees {
/** Maps a tag/element name (e.g. "VB-ADD") to the ActionPhrase type it triggers (e.g. "Add"). */
private HashMap<String, String> actionMap = new HashMap<String, String>();
/** Lower-case element names at which phrases are split. */
private static final List<String> splitList = Arrays.asList("comma", "cc", "stop", "colon", "rb-conj");
/** Whether phrases containing a TimePhrase may be turned into "Wait" action phrases; cleared by setActionMap. */
private boolean includeWaitPhrase = true;
/********************************
 * Creates a post-processor with an empty action map. The default map is
 * loaded lazily by process() if none has been supplied by then.
 ********************************/
public PostProcessTrees() {
    this.actionMap = new HashMap();
}
/********************************
 * Returns the map from tag names to action types currently in use.
 * The live map is returned, not a copy.
 *
 * @return actionMap (HashMap)
 ********************************/
public HashMap getActionMap() {
    return this.actionMap;
}
/********************************
 * Setter method for actionMap.
 * Supplying a custom map also switches off the automatic creation of
 * "Wait" action phrases from TimePhrases.
 *
 * @param actionMap
 *            (HashMap) map from tag names to action types
 ********************************/
public void setActionMap(HashMap actionMap) {
    this.includeWaitPhrase = false;
    this.actionMap = actionMap;
}
/********************************
 * Loads default ActionMap for ChemicalTagger.
 * Each default associates a tag name with the type of ActionPhrase it
 * triggers; existing entries with the same tag name are overwritten.
 ********************************/
public void loadDefaultActionMap() {
    // Each row is { tag name, action type }
    final String[][] defaults = {
        // Add
        {"VB-ADD", "Add"},
        {"NN-ADD", "Add"},
        {"VB-CHARGE", "Add"},
        {"NN-MIXTURE", "Add"},
        {"VB-CONTAIN", "Add"},
        {"VB-DILUTE", "Add"},
        {"VB-DROP", "Add"},
        {"VB-FILL", "Add"},
        {"VB-TREAT", "Add"},
        // Apparatus
        {"VB-APPARATUS", "ApparatusAction"},
        // Concentrate
        {"VB-CONCENTRATE", "Concentrate"},
        {"NN-CONCENTRATE", "Concentrate"},
        // Cool
        {"VB-COOL", "Cool"},
        // Degass
        {"VB-DEGASS", "Degass"},
        {"VB-SUBJECT", "Degass"},
        // Dissolve
        {"VB-DISSOLVE", "Dissolve"},
        // Dry
        {"VB-DRY", "Dry"},
        // Extract
        {"VB-EXTRACT", "Extract"},
        {"NN-EXTRACT", "Extract"},
        // Filter
        {"VB-FILTER", "Filter"},
        {"NN-FILTER", "Filter"},
        // Heat
        {"VB-HEAT", "Heat"},
        {"VB-INCREASE", "Heat"},
        {"VB-SUBMERGE", "Heat"},
        // Partition
        {"VB-PARTITION", "Partition"},
        // Precipitate
        {"VB-PRECIPITATE", "Precipitate"},
        {"NN-PRECIPITATE", "Precipitate"},
        // Purify
        {"VB-PURIFY", "Purify"},
        {"NN-PURIFY", "Purify"},
        // Quench
        {"VB-QUENCH", "Quench"},
        // Recover
        {"VB-RECOVER", "Recover"},
        // Remove
        {"VB-REMOVE", "Remove"},
        {"NN-REMOVE", "Remove"},
        // Stir
        {"VB-STIR", "Stir"},
        // Synthesize
        {"VB-SYNTHESIZE", "Synthesize"},
        {"NN-SYNTHESIZE", "Synthesize"},
        // Wait
        {"VB-WAIT", "Wait"},
        // Wash
        {"VB-WASH", "Wash"},
        // Yield
        {"VB-YIELD", "Yield"},
    };
    for (String[] entry : defaults) {
        actionMap.put(entry[0], entry[1]);
    }
}
/**********************************************
 * Builds a new document in which every Sentence of the input has been
 * regrouped into action phrases, then assigns solvent/catalyst roles.
 * The default action map is loaded first if the map is still empty.
 *
 * @param doc
 *            (Document) document containing Sentence elements
 * @return processedDoc (Document) a new document rooted at a "Document" element
 ********************************************/
public Document process(Document doc) {
    if (actionMap.isEmpty()) {
        loadDefaultActionMap();
    }
    Element root = new Element("Document");
    Nodes sentences = doc.query("//Sentence");
    for (int idx = 0; idx < sentences.size(); idx++) {
        Element sentence = (Element) sentences.get(idx);
        root.appendChild(processActionPhrases(sentence));
    }
    // Document-wide passes run once all sentences have been attached
    processDissolve(root);
    assignRolesByFollowingWord(root);
    return new Document(root);
}
/******************************************
 * Renames every DissolvePhrase (as recognised by the ANTLR grammar) to an
 * ActionPhrase with type="Dissolve" and then assigns solvent roles in it.
 *
 * @param dissolveElement
 *            (Element) element whose tree is searched for DissolvePhrases
 * @return dissolveElement (Element) the same element that was passed in
 ********************************/
private Element processDissolve(Element dissolveElement) {
    Nodes dissolvePhrases = dissolveElement.query("//DissolvePhrase");
    for (int idx = 0; idx < dissolvePhrases.size(); idx++) {
        Element phrase = (Element) dissolvePhrases.get(idx);
        phrase.setLocalName("ActionPhrase");
        phrase.addAttribute(new Attribute("type", "Dissolve"));
        processSolvent(phrase);
    }
    return dissolveElement;
}
/**
 * Looks for an OSCARCM directly followed by an NN-CHEMENTITY terminal and,
 * when the OSCARCM sits inside a MOLECULE, uses the following word to set
 * the molecule's role: "ethanol solvent" becomes a Solvent,
 * "palladium catalyst" (or "... accelerant") becomes a Catalyst.
 * @param root element whose tree is searched
 */
private void assignRolesByFollowingWord(Element root) {
    Nodes chemicalNames = root.query("//OSCARCM");
    for (int idx = 0; idx < chemicalNames.size(); idx++) {
        Element chemicalName = (Element) chemicalNames.get(idx);
        Element following = Utils.getNextTerminalElement(chemicalName);
        if (following == null || !following.getLocalName().equals("NN-CHEMENTITY")) {
            continue;
        }
        Element parent = (Element) chemicalName.getParent();
        if (parent == null || !parent.getLocalName().equals("MOLECULE")) {
            continue;
        }
        String followingWordLC = following.getValue().toLowerCase();
        if (followingWordLC.contains("solvent")) {
            parent.addAttribute(new Attribute("role", "Solvent"));
        } else if (followingWordLC.contains("catalyst") || followingWordLC.contains("accelerant")) {
            parent.addAttribute(new Attribute("role", "Catalyst"));
        }
    }
}
/******************************************
* Builds a new Sentence in which the phrases of the given sentence are
* grouped into ActionPhrase elements.
*
* Child phrases are buffered in elementList. A phrase that is a VerbPhrase
* or contains an element named in the actionMap starts a new group;
* elements named in splitList (comma, cc, stop, colon, rb-conj) end the
* current group. A group becomes an ActionPhrase when an action element
* was found in it, or (without one) when it contains a TimePhrase ("Wait",
* only if includeWaitPhrase is set) or a MultipleApparatus
* ("ApparatusAction"); otherwise its elements are copied directly into the
* sentence.
*
* @param sentenceNode
* (Element) sentence whose child phrases are regrouped
* @return newSentence (Element) a new Sentence element holding copies of the phrases
********************************/
private Element processActionPhrases(Element sentenceNode) {
Element newSentence = new Element("Sentence");
// Phrases collected for the group currently being built
List elementList = new ArrayList();
boolean seenVerbOrAtionNoun = false;// a verb or a noun like
// purification
// Non-null while the current group is known to be an action phrase
Element actionPhrase = null;
Elements sentenceChildren = sentenceNode.getChildElements();
for (int i = 0; i < sentenceChildren.size(); i++) {
Element phraseElement = sentenceChildren.get(i);
String actionElementName = findFirstActionElementNameOutsideOfAMolecule(phraseElement);
if (actionElementName != null
|| phraseElement.getLocalName().equals("VerbPhrase")) {
if (seenVerbOrAtionNoun) {
if (actionPhrase != null) {// This the start of a new
// phrase, so add all seen
// elements into the previous
// actionPhrase
addListToParentNode(actionPhrase, elementList);
appendActionPhrase(newSentence, actionPhrase);
actionPhrase = null;
} else {
// Previous group had a verb but no recognised action: keep its elements ungrouped
addListToParentNode(newSentence, elementList);
}
elementList = new ArrayList();
}
seenVerbOrAtionNoun = true;
elementList.add(phraseElement);
//TODO is this condition a good idea?
if (!hasNNExampleOutsideOfProcedure(elementList)) {//not something like "example of synthesis"
if (actionElementName != null) {
// The action type is looked up from the name of the first action element found
actionPhrase = new Element("ActionPhrase");
Attribute attribute = new Attribute("type",
actionMap.get(actionElementName));
actionPhrase.addAttribute(attribute);
}
else{
// A VerbPhrase without an action element: fall back on the content of the buffered elements
List elementNames = elementListToSelfAndDescendentElementNames(elementList);
if (elementNames.contains("TimePhrase")
&& includeWaitPhrase) {
Attribute attribute = new Attribute("type", "Wait");
actionPhrase = createActionPhrase(elementList,
attribute);
appendActionPhrase(newSentence, actionPhrase);
// The phrase is complete, so reset all grouping state
actionPhrase = null;
elementList = new ArrayList();
seenVerbOrAtionNoun = false;
} else if (elementNames.contains("MultipleApparatus")) {
Attribute attribute = new Attribute("type",
"ApparatusAction");
actionPhrase = createActionPhrase(elementList,
attribute);
appendActionPhrase(newSentence, actionPhrase);
// The phrase is complete, so reset all grouping state
actionPhrase = null;
elementList = new ArrayList();
seenVerbOrAtionNoun = false;
}
}
}
} else if (splitList.contains(phraseElement.getLocalName()
.toLowerCase())) {
// A splitting element (punctuation/conjunction) ends the current group
if (actionPhrase != null) {
addListToParentNode(actionPhrase, elementList);
appendActionPhrase(newSentence, actionPhrase);
elementList = new ArrayList();
actionPhrase = null;
} else {
// add nodes to sentence if a verbOrAtionNoun has been seen,
// otherwise keep waiting for an action term
if (seenVerbOrAtionNoun) {
addListToParentNode(newSentence, elementList);
elementList = new ArrayList();
} else {
List elementNames = elementListToSelfAndDescendentElementNames(elementList);
if (elementNames.contains("TimePhrase")
&& includeWaitPhrase) {
Attribute attribute = new Attribute("type", "Wait");
actionPhrase = createActionPhrase(elementList,
attribute);
appendActionPhrase(newSentence, actionPhrase);
actionPhrase = null;
elementList = new ArrayList();
}
}
}
if (elementList.isEmpty()) {// append the punctuation directly
// if they are not within a phrase
newSentence.appendChild(new Element(phraseElement));
} else {
// Elements are still pending, so the splitting element stays with them
elementList.add(phraseElement);
}
seenVerbOrAtionNoun = false;
} else {
// Neither an action nor a splitting element: just buffer it
elementList.add(phraseElement);
}
}
// Flush whatever is still buffered at the end of the sentence
if (elementList.size() > 0) {
if (actionPhrase != null) {
addListToParentNode(actionPhrase, elementList);
appendActionPhrase(newSentence, actionPhrase);
}
else{
addListToParentNode(newSentence, elementList);
}
}
newSentence = checkForRolePrepPhrase(newSentence);
return newSentence;
}
/**
 * Are any of the given elements' descendants NN-EXAMPLE nodes whose parent
 * is not a PROCEDURE node.
 * @param elementList elements to search
 * @return true as soon as one such NN-EXAMPLE is found, false otherwise
 */
private boolean hasNNExampleOutsideOfProcedure(List<Element> elementList) {
    for (Element element : elementList) {
        Nodes exampleNodes = element.query(".//NN-EXAMPLE");
        for (int i = 0; i < exampleNodes.size(); i++) {
            // A descendant found below an element always has an Element parent
            Element parent = (Element) exampleNodes.get(i).getParent();
            if (!parent.getLocalName().equals("PROCEDURE")) {
                return true;
            }
        }
    }
    return false;
}
/****************************************
 * Adds solvent roles for nodes within an ActionPhrase, depending on the
 * phrase type: Dissolve (molecules after IN-IN), Wash and Extract
 * (molecules after IN-WITH), Add containing a VB-DILUTE (after IN-WITH)
 * and Purify containing both an NN-CHROMATOGRAPHY and a MIXTURE.
 *
 * If the given element is not itself an ActionPhrase, its first descendant
 * ActionPhrase is processed and returned instead.
 * A missing "type" attribute is tolerated and leaves the phrase unchanged.
 *
 * @param actionElement
 *            (Element)
 * @return actionElement (Element) the processed ActionPhrase, or null if
 *         the element neither is nor contains an ActionPhrase
 ****************************************/
private Element processSolvent(Element actionElement) {
    if (!actionElement.getLocalName().toLowerCase().contains("actionphrase")) {
        Nodes actionNodes = actionElement.query(".//ActionPhrase");
        if (actionNodes.size() == 0) {
            return null;
        }
        actionElement = (Element) actionNodes.get(0);
    }
    // getAttributeValue yields null when the attribute is absent, so
    // compare with the constant on the left to stay null-safe
    String type = actionElement.getAttributeValue("type");
    if ("Dissolve".equals(type)) {
        addSolventRole(actionElement, "IN-IN");
    } else if ("Wash".equals(type) || "Extract".equals(type)) {
        addSolventRole(actionElement, "IN-WITH");
    } else if ("Add".equals(type)) {
        if (actionElement.query(".//VB-DILUTE").size() > 0) {
            addSolventRole(actionElement, "IN-WITH");
        }
    } else if ("Purify".equals(type)) {
        if (actionElement.query(".//NN-CHROMATOGRAPHY").size() > 0
                && actionElement.query(".//MIXTURE").size() > 0) {
            findMixtureSolvents(actionElement);
        }
    }
    return actionElement;
}
/****************************************
 * Runs solvent-role assignment on an action phrase and appends the element
 * returned by that step to the sentence.
 *
 * @param newSentence
 *            (Element) sentence being built
 * @param actionElement
 *            (Element) action phrase to add
 **********************************/
private void appendActionPhrase(Element newSentence, Element actionElement) {
    Element processedPhrase = processSolvent(actionElement);
    newSentence.appendChild(processedPhrase);
}
/********************************************************************************
 * Checks for role in preparation phrases that have the format 'Using hexane
 * as an eluent/solvent'. For every RolePrepPhrase with a recognisable role,
 * the role is applied to the molecules of the preceding noun or
 * prepositional phrase (verbs and commas in between are skipped).
 *
 * @param newSentence
 *            (Element) sentence to search
 * @return newSentence (Element) the same element that was passed in
 **************************************************************************/
private Element checkForRolePrepPhrase(Element newSentence) {
    Nodes rolePhrases = newSentence.query(".//RolePrepPhrase");
    for (int idx = 0; idx < rolePhrases.size(); idx++) {
        Element rolePhrase = (Element) rolePhrases.get(idx);
        String role = getRole(rolePhrase);
        if (role == null) {
            continue;
        }
        Element target = getPreviousElementOrElementBeforeVerbs(rolePhrase);
        if (target == null) {
            continue;
        }
        String targetName = target.getLocalName();
        if (targetName.equalsIgnoreCase("nounphrase")
                || targetName.equalsIgnoreCase("prepphrase")) {
            setRole(target, role);
        }
    }
    return newSentence;
}
/**
 * Walks backwards from the given phrase, via Utils.getPreviousSiblingOrParentsSibling,
 * skipping elements named vbd, vbn, vbz or comma (case-insensitively).
 * @param rolePhrase element to start from
 * @return the first preceding element that is not skipped, or null if there is none
 */
private Element getPreviousElementOrElementBeforeVerbs(Element rolePhrase) {
    List skippable = Arrays.asList("vbd", "vbn", "vbz", "comma");
    Element candidate = Utils.getPreviousSiblingOrParentsSibling(rolePhrase);
    while (candidate != null) {
        if (!skippable.contains(candidate.getLocalName().toLowerCase())) {
            break;
        }
        candidate = Utils.getPreviousSiblingOrParentsSibling(candidate);
    }
    return candidate;
}
/*****************************************
 * Sets the role attribute on every MOLECULE descendant of the given
 * element.
 *
 * @param previousElement
 *            (Element) element whose molecules receive the role
 * @param role
 *            (String) value for the role attribute
 ******************************************/
private void setRole(Element previousElement, String role) {
    Nodes molecules = previousElement.query(".//MOLECULE");
    int moleculeCount = molecules.size();
    for (int idx = 0; idx < moleculeCount; idx++) {
        Element molecule = (Element) molecules.get(idx);
        Attribute roleAttribute = new Attribute("role", role);
        molecule.addAttribute(roleAttribute);
    }
}
/*******************************************************************
 * This gets the role from preposition phrases that start with 'As'. E.g: As
 * a solvent/ as an eluent.
 * The phrase must contain exactly one NN-CHEMENTITY. Text containing
 * "eluent" or "solvent" maps to "Solvent", text containing "catalyst" or
 * "accelerant" maps to "Catalyst"; any other text is returned as it is.
 *
 * @param rolePhrase
 *            (Element)
 * @return role (String) or null when the phrase does not contain exactly
 *         one NN-CHEMENTITY
 *****************************************************************/
private String getRole(Element rolePhrase) {
    Nodes roleNameNodes = rolePhrase.query(".//NN-CHEMENTITY");
    if (roleNameNodes.size() != 1) {
        return null;
    }
    String roleText = roleNameNodes.get(0).getValue();
    String roleTextLC = roleText.toLowerCase();
    if (roleTextLC.contains("eluent") || roleTextLC.contains("solvent")) {
        return "Solvent";
    }
    if (roleTextLC.contains("catalyst") || roleTextLC.contains("accelerant")) {
        return "Catalyst";
    }
    return roleText;
}
/*************************************************************************
 * Takes a list of elements and returns the local names of those elements
 * and of all their descendants, element by element in list order.
 *
 * @param elementList
 *            (List)
 * @return elementNames (List)
 **************************************************************************/
private List<String> elementListToSelfAndDescendentElementNames(
        List<Element> elementList) {
    List<String> elementNames = new ArrayList<String>();
    for (Element element : elementList) {
        elementNames.addAll(getElementAndDescendantElementNameList(element));
    }
    return elementNames;
}
/*************************************************************************
 * Given an element returns in document order the element's descendants
 * localnames. The startingElement's localname will be the first in the list
 *
 * @param startingElement
 *            (Element)
 * @return elementNames (List)
 ************************************************************************/
private List<String> getElementAndDescendantElementNameList(
        Element startingElement) {
    List<String> elementNames = new ArrayList<String>();
    // Explicit stack for a depth-first walk; seeding it with the starting
    // element puts that element's name first
    LinkedList<Element> stack = new LinkedList<Element>();
    stack.add(startingElement);
    while (!stack.isEmpty()) {
        Element currentElement = stack.removeLast();
        elementNames.add(currentElement.getLocalName());
        Elements children = currentElement.getChildElements();
        // Push in reverse so that the first child is popped first
        for (int i = children.size() - 1; i >= 0; i--) {
            stack.add(children.get(i));
        }
    }
    return elementNames;
}
/****************************************************
 * Appends a copy of each listed Element to a parent node, in list order.
 * Copies are used, so the listed elements stay attached to their original
 * parents.
 *
 * @param parentNode
 *            (Element)
 * @param elementList
 *            (List)
 ****************************************************/
private void addListToParentNode(Element parentNode,
        List<Element> elementList) {
    for (Element element : elementList) {
        parentNode.appendChild((Element) element.copy());
    }
}
/*******************************************************************************
 * Creates an actionPhrase element with the given attribute and copies of
 * the given children.
 *
 * @param children
 *            (List)
 * @param attribute
 *            (Attribute)
 * @return actionPhrase (Element)
 ********************************************************************************/
private Element createActionPhrase(List<Element> children,
        Attribute attribute) {
    Element actionPhrase = new Element("ActionPhrase");
    actionPhrase.addAttribute(attribute);
    addListToParentNode(actionPhrase, children);
    return actionPhrase;
}
/**********************************************
 * Marks every OSCARCM inside the first MIXTURE of the action phrase as a
 * solvent. The role is set on the enclosing MOLECULE; an OSCARCM that is
 * not inside a MOLECULE is wrapped in one first.
 * The caller must ensure that the phrase contains a MIXTURE.
 *
 * @param actionElement
 *            (Element)
 **********************************************/
private void findMixtureSolvents(Element actionElement) {
    Nodes mixtures = actionElement.query(".//MIXTURE");
    Element firstMixture = (Element) mixtures.get(0);
    Nodes chemicalNames = firstMixture.query(".//OSCARCM");
    for (int idx = 0; idx < chemicalNames.size(); idx++) {
        Element chemicalName = (Element) chemicalNames.get(idx);
        Element parent = (Element) chemicalName.getParent();
        Element molecule;
        if (parent.getLocalName().equals("MOLECULE")) {
            molecule = parent;
        } else {
            // Turn the OSCARCM itself into a MOLECULE wrapping a copy of the original
            Element innerCopy = (Element) chemicalName.copy();
            chemicalName.setLocalName("MOLECULE");
            chemicalName.removeChildren();
            chemicalName.appendChild(innerCopy);
            molecule = chemicalName;
        }
        molecule.addAttribute(new Attribute("role", "Solvent"));
    }
}
/***********************************************************************
 * Adds solvent roles to molecule nodes. Searches, in document order, for
 * molecule nodes that are after the given preposition and adds a
 * role="Solvent" attribute. Once one solvent has been found, the search
 * stops at the first molecule that has a mass or an amount.
 * Elements whose name contains "Unmatched" are left untouched.
 *
 * @param solventElement
 *            (Element)
 * @param preposition
 *            (String) element name of the preposition, e.g. "IN-WITH"
 ***********************************************************************/
private void addSolventRole(Element solventElement, String preposition) {
    if (solventElement.getLocalName().contains("Unmatched")) {
        return;
    }
    boolean seenPreposition = false;
    boolean foundAtleastOneSolvent = false;
    LinkedList<Element> elementsToInvestigate = new LinkedList<Element>();
    Elements children = solventElement.getChildElements();
    for (int i = 0; i < children.size(); i++) {
        elementsToInvestigate.add(children.get(i));
    }
    while (!elementsToInvestigate.isEmpty()) {
        Element elementToInvestigate = elementsToInvestigate.removeFirst();
        String localNameLC = elementToInvestigate.getLocalName().toLowerCase();
        if (localNameLC.contains("phrase") || localNameLC.contains("ratio")) {
            // Children of phrases are investigated next, in document order,
            // so insert them at the front in reverse
            Elements elChildren = elementToInvestigate.getChildElements();
            for (int i = elChildren.size() - 1; i >= 0; i--) {
                elementsToInvestigate.addFirst(elChildren.get(i));
            }
        } else if (localNameLC.contains("molecule") && seenPreposition) {
            if (foundAtleastOneSolvent && moleculeDoesNotLookLikeASolvent(elementToInvestigate)) {
                return;
            }
            elementToInvestigate.addAttribute(new Attribute("role", "Solvent"));
            foundAtleastOneSolvent = true;
        } else if (elementToInvestigate.getLocalName().equals(preposition)) {
            seenPreposition = true;
        }
    }
}
/**
 * Checks for the presence of a mass or an amount.
 * These are rarely specified for a solvent
 * @param molecule element whose descendants are searched for MASS and AMOUNT
 * @return true if the molecule contains a MASS or an AMOUNT descendant
 */
private boolean moleculeDoesNotLookLikeASolvent(Element molecule) {
    if (molecule.query(".//MASS").size() > 0) {
        return true;
    }
    return molecule.query(".//AMOUNT").size() > 0;
}
/**********************************************************************
 * Given an element searches through its descendants in document order and
 * returns the first element with a localname corresponding to an entry in
 * the actionMap or null if none of their names are present in the
 * actionMap. The starting element itself is not tested.
 *
 * Molecule and UnnamedMolecule elements are not recursively investigated to prevent
 * mistakes such as "Compound obtained from example 4" which is not a yield
 *
 * @param startingElement
 *            (Element)
 * @return elementName (String)
 **********************************************************************/
private String findFirstActionElementNameOutsideOfAMolecule(Element startingElement) {
    // Explicit stack for a depth-first walk; children are pushed in reverse
    // so that the first child is popped first
    LinkedList<Element> stack = new LinkedList<Element>();
    Elements children = startingElement.getChildElements();
    for (int i = children.size() - 1; i >= 0; i--) {
        stack.add(children.get(i));
    }
    while (!stack.isEmpty()) {
        Element currentElement = stack.removeLast();
        String elementName = currentElement.getLocalName();
        if (actionMap.containsKey(elementName)) {
            return elementName;
        }
        if (elementName.equals("UNNAMEDMOLECULE") || elementName.equals("MOLECULE")) {
            // Do not descend into molecules
            continue;
        }
        children = currentElement.getChildElements();
        for (int i = children.size() - 1; i >= 0; i--) {
            stack.add(children.get(i));
        }
    }
    return null;
}
}
RecombineTokens.java 0000664 0000000 0000000 00000022523 14163277713 0033764 0 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Set;
import uk.ac.cam.ch.wwmm.oscar.document.Token;
/******************************************
* Combines tokens that have been erroneously split
* @author lh359
*******************************************/
public class RecombineTokens {
private static Set nonHyphenTags = new HashSet(Arrays.asList("dash", "comma", "cc", "stop"));
private static Set quantityUnitTags = new HashSet(Arrays.asList("nn-vol", "nn-area", "nn-perarea", "nn-pressure", "nn-parts", "nn-moles", "nn-persecond", "nn-timeunit", "nn-mass", "nn-pertimeunit", "nn-vol", "nn-amount", "nn-units"));
/****************************
* Hides Utility Class Constructor.
***************************/
private RecombineTokens(){
}
/*********************************************
* Indexes the tokens that need combining and then calls combineTokens.
* @param posContainer (POSContainer) .
* @return posContainer (POSContainer) .
*********************************************/
public static POSContainer recombineTokens(POSContainer posContainer) {
List previousIndexList = new ArrayList();
List wordTokenList = posContainer.getWordTokenList();
List combinedTagList = posContainer.getCombinedTagsList();
LinkedHashMap> indexMap = new LinkedHashMap>();
for (int currentIndex = 0; currentIndex < wordTokenList.size(); currentIndex++) {
List indexList = new ArrayList();
String currentTagLc = combinedTagList.get(currentIndex).toLowerCase();
if (currentTagLc.equals("dash")) {
if (currentIndex == 0
&& currentIndex + 1 < wordTokenList.size()) {
indexList.add(currentIndex);
indexList.add(currentIndex + 1);
indexMap.put(indexList.get(0), indexList);
} else if (currentIndex + 1 == wordTokenList.size()) {
indexList.add(currentIndex - 1);
indexList.add(currentIndex);
indexMap.put(indexList.get(0), indexList);
} else {
String previousTag = combinedTagList.get(currentIndex - 1);
String nextTag = combinedTagList.get(currentIndex + 1);
if (!(previousTag.startsWith("OSCAR-CM")
&& nextTag.startsWith("OSCAR-CM") && !wordTokenList.get(currentIndex + 1).getSurface().startsWith("-"))
&& !(nextTag.startsWith("CD") && previousTag.startsWith("NN")) && !isAHyphenedUnit(previousTag,nextTag)) {
if (previousIndexList.contains(currentIndex - 1)) {
//previous token was involved in a recombination, hence need to append to its indexList
indexList = previousIndexList;
if (!indexList.contains(currentIndex)) {
indexList.add(currentIndex);
}
if (!indexList.contains(currentIndex + 1)) {
indexList.add(currentIndex + 1);
}
indexMap.put(indexList.get(0), indexList);
} else if (nonHyphenTags.contains(previousTag.toLowerCase())) {
indexList.add(currentIndex);
indexList.add(currentIndex + 1);
indexMap.put(indexList.get(0), indexList);
} else if (nonHyphenTags.contains(nextTag.toLowerCase())) {
indexList.add(currentIndex - 1);
indexList.add(currentIndex);
indexMap.put(indexList.get(0), indexList);
} else {
indexList.add(currentIndex - 1);
indexList.add(currentIndex);
indexList.add(currentIndex + 1);
indexMap.put(indexList.get(0), indexList);
}
}
else if (wordTokenList.get(currentIndex).getSurface().equals("/")
&& previousTag.equals("NN-AMOUNT") && nextTag.equals("NN-VOL")){
indexList.add(currentIndex - 1);
indexList.add(currentIndex);
indexList.add(currentIndex + 1);
indexMap.put(indexList.get(0), indexList);
combinedTagList.set(currentIndex - 1, "NN-MOLAR");
combinedTagList.set(currentIndex + 1, "NN-MOLAR");
}
}
}
else if (currentTagLc.equals("nn-temp")) {//Identifies cases such as "50C . was" and corrects them to "50C. was"
if (wordTokenList.get(currentIndex).getSurface().toLowerCase().endsWith("c") && currentIndex >0 && currentIndex + 2 < wordTokenList.size()){
String nextTag = combinedTagList.get(currentIndex + 1);
if (nextTag.equalsIgnoreCase("stop")){
String previousWord = wordTokenList.get(currentIndex - 1).getSurface();
if (Character.isDigit(previousWord.charAt(previousWord.length()-1))){
String wordAfterStop = wordTokenList.get(currentIndex + 2).getSurface();
//if appears to be the start of a new sentence
if (!Character.isUpperCase(wordAfterStop.charAt(0)) && !Character.isDigit(wordAfterStop.charAt(0))){
indexList = new ArrayList();
indexList.add(currentIndex);
indexList.add(currentIndex + 1);
indexMap.put(indexList.get(0), indexList);
}
}
}
}
}
else if (currentTagLc.equals("nn-time")) {//Identifies cases such as "min . and" and corrects them to "min. and"
if (wordTokenList.get(currentIndex).getSurface().equalsIgnoreCase("min") && currentIndex >0 && currentIndex + 2 < wordTokenList.size()){
String nextTag = combinedTagList.get(currentIndex + 1);
if (nextTag.equalsIgnoreCase("stop")){
String wordAfterStop = wordTokenList.get(currentIndex + 2).getSurface();
if (!Character.isUpperCase(wordAfterStop.charAt(0))){
indexList = new ArrayList();
indexList.add(currentIndex);
indexList.add(currentIndex + 1);
indexMap.put(indexList.get(0), indexList);
}
}
}
}
if (!indexList.isEmpty()){
previousIndexList = indexList;
}
}
return combineTokens(posContainer, indexMap);
}
/***********************************
* Checks if both tags are units separated by slashes
* @param previousTag
* @param nextTag
* @return boolean
*/
private static boolean isAHyphenedUnit(String previousTag, String nextTag) {
return quantityUnitTags.contains(previousTag.toLowerCase()) && quantityUnitTags.contains(nextTag.toLowerCase());
}
/*****************************************
* Combines the tokens based on the indices in indexMap.
* @param posContainer (POSContainer)
* @param indexMap (LinkedHashMap)
* @return posContainer (POSContainer)
*****************************************/
private static POSContainer combineTokens(POSContainer posContainer, LinkedHashMap> indexMap) {
if (indexMap.size() > 0) {
List newWordTokenList = new ArrayList();
List newCombinedTagsList = new ArrayList();
int tokenIndex = 0;
for (int i = 0; i < posContainer.getWordTokenList().size(); i++) {
Token newToken;
if (!indexMap.keySet().contains(i)) {
newToken = posContainer.getWordTokenList().get(i);
newCombinedTagsList.add(posContainer.getCombinedTagsList().get(i));
} else {
List indexList = indexMap.get(i);
String tagName = getTagName(posContainer,indexList);
StringBuilder multiTokenWord = new StringBuilder();
for (Integer integer : indexList) {
multiTokenWord.append(posContainer.getWordTokenList().get(integer).getSurface());
}
int start = posContainer.getWordTokenList().get(indexList.get(0)).getStart();
int end = posContainer.getWordTokenList().get(indexList.get(indexList.size()-1)).getEnd();
newToken = new Token(multiTokenWord.toString(), start, end, null, posContainer.getWordTokenList().get(indexList.get(indexList.size()-1)).getBioType(), null);
newCombinedTagsList.add(tagName);
i = i + indexList.size() - 1;
}
newToken.setIndex(tokenIndex++);
newWordTokenList.add(newToken);
}
posContainer.setWordTokenList(newWordTokenList);
posContainer.setCombinedTagsList(newCombinedTagsList);
}
return posContainer;
}
/*****************************************
* Creates a new tagname for the combined tokens.
* Sets the tag to JJ-CHEM if one of the tags are an
* adjective or verb in the past tense.
* @param posContainer (POSContainer)
* @param indexList (List)
* @return tagName (String)
*****************************************/
private static String getTagName(POSContainer posContainer,List indexList) {
String tagName = "";
List jjChemList = Arrays.asList("jj", "vbn", "jj-chem");
for (Integer integer : indexList) {
String tag = posContainer.getCombinedTagsList().get(integer);
if (!tagName.toLowerCase().startsWith("oscar") & tag.contains("-")){
tagName = tag;
}
if (tagName.equals("") & !tag.toLowerCase().equals("dash")){
tagName = tag;
}
if (jjChemList.contains(tag.toLowerCase())){
tagName = "JJ-CHEM";
}
}
return tagName;
}
}
chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/RegexTagger.java 0000664 0000000 0000000 00000010351 14163277713 0033154 0 ustar 00root root 0000000 0000000 /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import uk.ac.cam.ch.wwmm.oscar.document.Token;
/*****************************************************
 * Runs the regular expression tagger .
 *
 * Rules are read from a resource file whose non-comment lines have the
 * form {@code TAG---regex}; a token is given the tag of the first rule
 * whose pattern matches at the start of the token's surface.
 *
 * @author lh359, dl387, pm286
 *****************************************************/
public class RegexTagger implements Tagger{
    private List<Rule> rules;
    private String tagFilePath = "/uk/ac/cam/ch/wwmm/chemicaltagger/regexTagger/regexTags.txt";
    private static Logger LOG = Logger.getLogger(RegexTagger.class);
    private List<String> ignoredTags = new ArrayList<String>();
    /** Placeholder pattern used to create a matcher before a rule pattern is swapped in. */
    private static final Pattern EMPTY_PATTERN = Pattern.compile("");

    /****************************
     * Public Constructor.
     * Loads the rules from the default tag file.
     ***************************/
    public RegexTagger() {
        initializeRules();
    }

    /****************************
     * Public Constructor.
     * Loads the rules from the default tag file, skipping the given tags.
     * @param ignoredTags (List) names of rules that should not be loaded
     ***************************/
    public RegexTagger(List<String> ignoredTags) {
        this.ignoredTags = ignoredTags;
        initializeRules();
    }

    /**********************
     * Public Constructor.
     * Sets a tagFile and loads the rules from it.
     * @param tagFile (String) resource path of the tag file
     */
    public RegexTagger(String tagFile) {
        this.tagFilePath = tagFile;
        // Without this the rules stayed null and runTagger could not tag anything.
        initializeRules();
    }

    /**************************************
     * Getter method for rules.
     * @return rules (List)
     ***************************************/
    public List<Rule> getRules() {
        return rules;
    }

    /**************************************
     * Setter method for rules.
     * @param rules (List)
     ***************************************/
    public void setRules(List<Rule> rules) {
        this.rules = rules;
    }

    /**************************************************************
     * Initialises the rules for the regular expression tagger.
     * Lines that are empty or start with '#' are skipped, as are lines
     * without a "---" separator and lines whose tag is in ignoredTags.
     * If the file cannot be read the error is logged and whatever rules
     * were read so far are kept.
     ***************************************************************/
    protected void initializeRules() {
        rules = new ArrayList<Rule>();
        // try-with-resources closes the reader even when reading fails part way.
        try (BufferedReader in = new BufferedReader(new InputStreamReader(
                Utils.getInputStream(getClass(), tagFilePath), Charset.forName("UTF-8")))) {
            if (!in.ready()) {
                throw new IOException("Tag file is not ready to be read: " + tagFilePath);
            }
            String line;
            while ((line = in.readLine()) != null) {
                if (!line.startsWith("#") && !StringUtils.isEmpty(line)) {
                    String[] lineTokens = line.split("---");
                    if (lineTokens.length > 1 && !ignoredTags.contains(lineTokens[0])) {
                        rules.add(new Rule(lineTokens[0], lineTokens[1]));
                    }
                }
            }
        } catch (IOException e) {
            LOG.error("Could not load regex tagger rules from " + tagFilePath, e);
        }
    }

    /*********************************************************
     * Runs the regular expression tagger against a list of tokens and returns a list of tags.
     * The result always has exactly one tag per token: "nil" is used when
     * no rule matches or when the token could not be processed.
     *
     * @param tokenList (List)
     * @param inputSentence (String) not used by this tagger
     * @return tagList (List)
     *********************************************************/
    public List<String> runTagger(List<Token> tokenList, String inputSentence) {
        List<String> tagList = new ArrayList<String>();
        for (Token token : tokenList) {
            String tag = "nil";
            try {
                Matcher m = EMPTY_PATTERN.matcher(token.getSurface());
                for (Rule r : rules) {
                    // lookingAt: the pattern must match from the start of the
                    // surface but need not consume all of it.
                    if (m.usePattern(r.getPattern()).lookingAt()) {
                        tag = r.getName();
                        break;
                    }
                }
            } catch (Exception e) {
                // Best effort: keep going, but keep the tag list aligned with the tokens.
                LOG.warn("Could not apply regex rules to token " + token + " of " + tokenList, e);
            }
            tagList.add(tag);
        }
        return tagList;
    }

    /**************************************
     * Getter method for ignoredTags.
     * @return ignoredTags (List)
     ***************************************/
    public List<String> getIgnoredTags() {
        return ignoredTags;
    }
}
chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/Rule.java 0000664 0000000 0000000 00000004577 14163277713 0031674 0 ustar 00root root 0000000 0000000 /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger;
import java.util.regex.Pattern;
/**************************************************************
 * A named regular expression. Each rule pairs a tag name with its
 * compiled pattern; used later by the regex tagger.
 ***************************************************************/
public class Rule {
    private String name;
    private Pattern pattern;

    /**************************************
     * Creates a case-insensitive rule.
     * @param name (String: name of the tag)
     * @param regex (String: Regex Pattern)
     ************************************/
    public Rule(String name, String regex) {
        this(name, regex, true);
    }

    /***************************************
     * Creates a rule, compiling the regex straight away.
     * @param name (String: name of the tag)
     * @param regex (String: Regex Pattern)
     * @param caseInsensitive (boolean) whether to compile with Pattern.CASE_INSENSITIVE
     */
    public Rule(String name, String regex, boolean caseInsensitive) {
        this.name = name;
        // A flag value of 0 is what the single-argument Pattern.compile uses.
        int flags = caseInsensitive ? Pattern.CASE_INSENSITIVE : 0;
        this.pattern = Pattern.compile(regex, flags);
    }

    /**************************************
     * Getter method for name.
     * @return name (String)
     ***************************************/
    public String getName() {
        return this.name;
    }

    /**************************************
     * Getter method for pattern.
     * @return pattern (Pattern)
     ***************************************/
    public Pattern getPattern() {
        return this.pattern;
    }
}
SentenceParser.java 0000664 0000000 0000000 00000007302 14163277713 0033614 0 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger;
import java.io.IOException;
import java.io.InputStream;
import nu.xom.Document;
import org.antlr.v4.runtime.tree.ParseTree;
import org.antlr.v4.runtime.tree.Tree;
import org.apache.commons.io.IOUtils;
/***********************************************
* Passes tagged sentences to the ANTLR grammar.
* And converts the output to an XML document.
*
* @author pm286, dl387, lh359
**********************************************/
public abstract class SentenceParser extends Thread {
private InputStream taggedTokenInStream = null;
private Tree parseTree = null;
/**********************************************
* Constructor method for inputStream objects.
*
* @param taggedTokenInputStream (InputStream)
*********************************************/
public SentenceParser(InputStream taggedTokenInputStream) {
this.taggedTokenInStream = taggedTokenInputStream;
}
/*******************************************
* Constructor method for String objects.
*
* @param taggedTokenString (String)
*******************************************/
public SentenceParser(String taggedTokenString) {
this.taggedTokenInStream = IOUtils.toInputStream(taggedTokenString,"UTF-8");
}
/************************************************
* Constructor method for POSContainer objects.
*
* @param posContainer (POSContainer)
*******************************************/
public SentenceParser(POSContainer posContainer) {
this.taggedTokenInStream = IOUtils.toInputStream(
posContainer.getTokenTagTupleAsString(), "UTF-8");
}
/********************************************
* Getter method for taggedTokenInputStream.
* @return taggedTokenInStream (InputStream)
******************************************/
public InputStream getTaggedTokenInStream() {
return taggedTokenInStream;
}
/*******************************************
* Setter method for taggedTokenInputStream.
*
* @param taggedTokenInStream (InputStream)
***********************************/
public void setTaggedTokenInStream(InputStream taggedTokenInStream) {
this.taggedTokenInStream = taggedTokenInStream;
}
/**************************************
* Getter method for parseTree.
* @return parseTree (Tree)
**************************************/
public Tree getParseTree() {
return parseTree;
}
/**************************************
* Setter method for parseTree.
* @param parseTree (Tree)
**************************************/
public void setParseTree(ParseTree parseTree) {
this.parseTree = parseTree;
}
/******************
* Runs the thread.
*/
public void run() {
parseTags();
}
/********************************************
* Passes an inputstream to ANTLR and produces
* a parse tree.
*
*******************************************/
public abstract void parseTags();
/*********************************************
* Creates an XML document from the parseTree.
* @return document (Document)
*******************************************/
public abstract Document makeXMLDocument();
}
SpectraTagger.java 0000664 0000000 0000000 00000005554 14163277713 0033435 0 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import nu.xom.Element;
import uk.ac.cam.ch.wwmm.oscar.document.ProcessingDocument;
import uk.ac.cam.ch.wwmm.oscar.document.ProcessingDocumentFactory;
import uk.ac.cam.ch.wwmm.oscardata.DataAnnotation;
import uk.ac.cam.ch.wwmm.oscardata.DataParser;
import uk.ac.cam.ch.wwmm.oscartokeniser.Tokeniser;
/*****************************************************
 * Runs the oscar SpectraTagger to pull out the NMR Spectra.
 *
 * @author lh359, dl387
 *****************************************************/
public class SpectraTagger {

    /** Element names (or "type" values of property elements) that are pulled out of the text. */
    private static final List<String> SPECTRA_KEYWORDS = Arrays.asList("spectrum", "hrms", "rf");

    /**************************
     * Hides Utility Class Constructor.
     */
    private SpectraTagger(){
    }

    /*******************************************************
     * Tags the NMR Spectra in a chemistry text.
     * Every data annotation recognised as spectral data is cut out of the
     * container's input text and its element is collected; the container
     * is then given the shortened text and the list of collected elements.
     *
     * @param posContainer (POSContainer)
     * @return posContainer (POSContainer) the same instance, updated
     *******************************************************/
    public static POSContainer runTagger(POSContainer posContainer) {
        Tokeniser tokeniser = Tokeniser.getDefaultInstance();
        ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(tokeniser, posContainer.getInputText());
        List<DataAnnotation> annotations = DataParser.findData(procDoc);
        StringBuilder newInputText = new StringBuilder();
        String sentence = posContainer.getInputText();
        int offset = 0;
        List<Element> spectraList = new ArrayList<Element>();
        for (DataAnnotation dataAnnotation : annotations) {
            Element annotated = dataAnnotation.getAnnotatedElement();
            if (isSpectralData(annotated)) {
                spectraList.add(annotated);
                // Keep the text between the previous cut and this annotation.
                newInputText.append(sentence.substring(offset, dataAnnotation.getStart()));
                offset = dataAnnotation.getEnd();
            }
        }
        // Keep whatever follows the last removed annotation.
        newInputText.append(sentence.substring(offset, sentence.length()));
        posContainer.setInputText(newInputText.toString());
        posContainer.setSpectrumList(spectraList);
        return posContainer;
    }

    /**
     * Checks whether an annotated element is spectral data: either its local
     * name is a spectra keyword, or it is a "property" element whose "type"
     * attribute value is one.
     *
     * The attribute VALUE is compared; comparing the Attribute node itself
     * with the keyword strings can never succeed.
     *
     * @param annotated (Element)
     * @return boolean
     */
    private static boolean isSpectralData(Element annotated) {
        String localName = annotated.getLocalName();
        if (SPECTRA_KEYWORDS.contains(localName)) {
            return true;
        }
        return localName.equals("property") && SPECTRA_KEYWORDS.contains(annotated.getAttributeValue("type"));
    }
}
chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/Tagger.java 0000664 0000000 0000000 00000002630 14163277713 0032162 0 ustar 00root root 0000000 0000000 /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger;
import java.util.List;
import uk.ac.cam.ch.wwmm.oscar.document.Token;
/*****************************************************
* An interface for a tagger.
* The tagger given a list of tokens should return a tag for each token
*
* @author lh359
*****************************************************/
public interface Tagger {
/*****************************************************
* Runs the tagger against the tokens and returns the tags as a list
* @param inputSentence
* @param tokenList (List)
* @return posContainer (POSContainer)
*****************************************************/
public List runTagger(List tokenList, String inputSentence);
public List getIgnoredTags() ;
}
chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/Utils.java 0000664 0000000 0000000 00000026357 14163277713 0032065 0 ustar 00root root 0000000 0000000 /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import nu.xom.Document;
import nu.xom.Element;
import nu.xom.Elements;
import nu.xom.Node;
import nu.xom.ParentNode;
import nu.xom.Serializer;
import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;
import uk.ac.cam.ch.wwmm.oscar.document.Token;
/********************************************
* A Utils class.
*
* @author lh359, pm286
********************************************/
public class Utils {
// Matches an exponent written with superscript markup, e.g. 10<sup>16</sup>:
// group 1 is the base, group 2 the exponent.
// NOTE(review): without the <sup> markup this pattern matches any run of two
// or more digits. The markup was reconstructed from the pattern's name and
// from convertExponentials' stated purpose of avoiding loss when XML tags are
// cleaned; confirm against the upstream source.
private static Pattern exponentXMLPattern = Pattern.compile("(-?\\d+)<sup>(-?\\d+)</sup>");
/**************************
 * Private constructor: this is a utility class and is never instantiated.
 */
private Utils() {
}
/******************************************
 * Turns arbitrary text into a name made only of the characters
 * A-Z, a-z, 0-9, '_', '.' and '-'.
 * Null or blank text gives "emptyName". Otherwise the text is trimmed,
 * prefixed with '_' when it does not start with a letter or '_', and every
 * other character is replaced with '_'.
 *
 * @param text (String)
 * @return ncName (String)
 *******************************************/
public static String makeNCName(String text) {
    if (text == null) {
        return "emptyName";
    }
    String trimmed = text.trim();
    if (trimmed.length() == 0) {
        return "emptyName";
    }
    char first = trimmed.charAt(0);
    boolean validStart = Character.isLetter(first) || first == '_';
    String candidate = validStart ? trimmed : "_" + trimmed;
    return candidate.replaceAll("[^A-Za-z0-9_.-]", "_");
}
/******************************************
 * Writes out an XML document to a file, serialised as UTF-8.
 * An I/O failure is reported with a stack trace and not rethrown.
 *
 * @param doc (Document)
 * @param xmlFilename (String)
 *******************************************/
public static void writeXMLToFile(Document doc, String xmlFilename) {
    // try-with-resources so the file handle is released even if writing fails.
    try (OutputStream out = new FileOutputStream(xmlFilename)) {
        Serializer serializer = new Serializer(out, "UTF-8");
        serializer.write(doc);
        serializer.flush();
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
/******************************************
 * Cleans up text from html characters.
 * Exponentials are converted first, then the text is cleaned with Jsoup
 * against an empty Whitelist, and finally the cleaned result is parsed
 * and its text content returned.
 *
 * @param paragraph (String)
 * @return cleanedParagraph (String)
 *******************************************/
public static String cleanHTMLText(String paragraph) {
    String withCarets = convertExponentials(paragraph);
    String withoutTags = Jsoup.clean(withCarets, new Whitelist());
    return Jsoup.parse(withoutTags).text();
}
/**********************************
 * Rewrites every match of exponentXMLPattern as "base^exponent", where the
 * base is the pattern's first group and the exponent its second.
 * This is mainly to avoid loss when cleaning xml tags.
 * Text without a match is returned unchanged.
 *
 * @param xmlExponential (String)
 * @return nonXMLExponential (String).
 */
private static String convertExponentials(String xmlExponential) {
    // "$1" and "$2" are resolved per match. Building the replacement from
    // group(1)/group(2) of the first find() stamped the first exponent
    // over every later one.
    return exponentXMLPattern.matcher(xmlExponential).replaceAll("$1^$2");
}
/***********************************************************
 * Loads a "sentence" resource from the system class path and returns its
 * whole content, read as UTF-8 and trimmed. Has to be a qualified file
 * name e.g. uk/ac/cam/ch/wwmm/foo.txt.
 *
 * @param resourceName (String)
 * @return sentence (String)
 * @throws RuntimeException if the resource does not exist or cannot be read
 **************************************************************/
public static String readSentence(String resourceName) {
    InputStream refStream = ClassLoader.getSystemResourceAsStream(resourceName);
    if (refStream == null) {
        // Report a missing resource the same way as an unreadable one.
        throw new RuntimeException("Cannot read sentence: " + resourceName);
    }
    try {
        return IOUtils.toString(refStream, "UTF-8").trim();
    } catch (IOException e) {
        throw new RuntimeException("Cannot read sentence: " + resourceName, e);
    } finally {
        IOUtils.closeQuietly(refStream);
    }
}
/**************************************************************
 * Reads the first line of the file at the given path as UTF-8 and returns
 * it trimmed. Despite the method name, a String is returned, not a stream.
 * An empty file gives an empty string.
 *
 * @param pathName (String)
 * @return sentence (String) the trimmed first line
 * @throws IOException declared for compatibility; read failures are
 *         reported as a RuntimeException carrying the original cause
 **************************************************************/
public static String getPathAsInputStream(String pathName)
        throws IOException {
    try (BufferedReader br = new BufferedReader(new InputStreamReader(
            new FileInputStream(new File(pathName)), "UTF-8"))) {
        String sentence = br.readLine();
        // readLine() returns null for an empty file.
        return sentence == null ? "" : sentence.trim();
    } catch (IOException e) {
        throw new RuntimeException("Cannot read sentence: " + pathName, e);
    }
}
/**************************************************************
* Returns the content of the resource as an inputstream.
*
* @param context (Class)
* @param pathName (String)
* @return sentence (String)
* @throws IOException
**************************************************************/
public static InputStream getInputStream(Class> context, String pathName)
throws IOException {
InputStream inStream = context.getResourceAsStream(pathName);
if (inStream == null) {
throw new IOException("File not found: " + pathName
+ " (using context " + context.getName() + ")");
}
return inStream;
}
/***********************************
 * Checks if a string contains at least one digit character
 * (as judged by Character.isDigit).
 *
 * @param currentString (String)
 * @return boolean true as soon as a digit is found, false otherwise
 ***********************************/
public static boolean containsNumber(String currentString) {
    final int length = currentString.length();
    int position = 0;
    while (position < length) {
        if (Character.isDigit(currentString.charAt(position))) {
            return true;
        }
        position++;
    }
    return false;
}
/*******************************************
 * Convenience overload for running chemicalTagger on a text:
 * delegates to the two-argument version with runSpectraTagger
 * set to false.
 *
 * @param text (String)
 * @return doc (Document)
 *****************************************/
public static Document runChemicalTagger(String text) {
    final boolean runSpectraTagger = false;
    return runChemicalTagger(text, runSpectraTagger);
}
/*******************************************
 * Convenience method for running chemicalTagger: tags the text with the
 * default ChemistryPOSTagger, parses the tags and builds the XML document.
 *
 * NOTE(review): runSpectraTagger is accepted but not consulted here; the
 * taggers are always run through the single-argument runTaggers(text).
 *
 * @param text (String)
 * @param runSpectraTagger (boolean) currently has no effect
 * @return doc (Document)
 *****************************************/
public static Document runChemicalTagger(String text,
        boolean runSpectraTagger) {
    POSContainer taggedSentence = ChemistryPOSTagger.getDefaultInstance().runTaggers(text);
    ChemistrySentenceParser sentenceParser = new ChemistrySentenceParser(taggedSentence);
    sentenceParser.parseTags();
    return sentenceParser.makeXMLDocument();
}
/**************************************
* Loads the tags from the .tokens file generated by Antlr.
* @param contextClass
* @return HashSet
*/
public static HashSet loadsTagsFromFile(Class> contextClass) {
HashSet tagSet = new HashSet();
String pathName = "ChemicalChunker.tokens";
InputStream inStream = contextClass.getClassLoader().getResourceAsStream(pathName);
List readlines = null;
try {
readlines = IOUtils.readLines(inStream);
} catch (IOException e) {
throw new RuntimeException("Could not load tokens file",e);
}
for (String line : readlines) {
if (line.startsWith("'")){
String tag = line.split("'")[1];
tagSet.add(tag);
}
}
IOUtils.closeQuietly(inStream);
return tagSet;
}
/**
 * Gets the next terminal element. This element need not be a sibling:
 * when the starting element is the last child of its parent, the search
 * continues from the parent.
 * @param startingEl element to start from
 * @return the first element without child elements that follows startingEl,
 *         or null if there is no parent element left to search or the
 *         following node is not an element
 */
public static Element getNextTerminalElement(Element startingEl) {
    ParentNode container = startingEl.getParent();
    // instanceof is false for null, so this also covers a missing parent.
    if (!(container instanceof Element)) {
        return null;
    }
    int followingIndex = container.indexOf(startingEl) + 1;
    if (followingIndex >= container.getChildCount()) {
        // startingEl was the last child: climb one level and try again.
        return getNextTerminalElement((Element) container);
    }
    Node following = container.getChild(followingIndex);
    if (!(following instanceof Element)) {
        return null;
    }
    // Descend along first children until an element without child elements is reached.
    Element terminal = (Element) following;
    for (Elements kids = terminal.getChildElements(); kids.size() != 0;
            kids = terminal.getChildElements()) {
        terminal = kids.get(0);
    }
    return terminal;
}
/**
 * Gets the previous terminal element. This element need not be a sibling:
 * when the starting element is the first child of its parent, the search
 * continues from the parent.
 * @param startingEl element to start from
 * @return the last element without child elements that precedes startingEl,
 *         or null if there is no parent element left to search or the
 *         preceding node is not an element
 */
public static Element getPreviousTerminalElement(Element startingEl) {
    ParentNode container = startingEl.getParent();
    // instanceof is false for null, so this also covers a missing parent.
    if (!(container instanceof Element)) {
        return null;
    }
    int ownIndex = container.indexOf(startingEl);
    if (ownIndex == 0) {
        // startingEl was the first child: climb one level and try again.
        return getPreviousTerminalElement((Element) container);
    }
    Node preceding = container.getChild(ownIndex - 1);
    if (!(preceding instanceof Element)) {
        return null;
    }
    // Descend along last children until an element without child elements is reached.
    Element terminal = (Element) preceding;
    for (Elements kids = terminal.getChildElements(); kids.size() != 0;
            kids = terminal.getChildElements()) {
        terminal = kids.get(kids.size() - 1);
    }
    return terminal;
}
/**
 * Gets the previous sibling if one exists, or the previous sibling of the
 * first parent element that has previous siblings.
 * @param startingEl element to start from
 * @return the preceding sibling element, or null if there is no parent
 *         element left to search or the preceding node is not an element
 */
public static Element getPreviousSiblingOrParentsSibling(Element startingEl) {
    ParentNode container = startingEl.getParent();
    // instanceof is false for null, so this also covers a missing parent.
    if (!(container instanceof Element)) {
        return null;
    }
    int ownIndex = container.indexOf(startingEl);
    if (ownIndex == 0) {
        // No earlier sibling at this level: look beside the parent instead.
        return getPreviousSiblingOrParentsSibling((Element) container);
    }
    Node preceding = container.getChild(ownIndex - 1);
    return (preceding instanceof Element) ? (Element) preceding : null;
}
/********************************************
 * Writes items of a list to a file, one item per line, encoded as UTF-8.
 * The line ending is passed to IOUtils as null, leaving the choice of
 * separator to IOUtils.
 * @param sentenceList the lines to write
 * @param filename path of the file to create or overwrite
 * @throws IOException if the file cannot be opened or written
 ********************************************/
public static void writeListToFile(List<String> sentenceList, String filename) throws IOException {
    OutputStream os = new FileOutputStream(new File(filename));
    try {
        IOUtils.writeLines(sentenceList, null, os, "UTF-8");
    } finally {
        // Release the file handle even when writing fails part-way.
        IOUtils.closeQuietly(os);
    }
}
/**
 * Concatenates token surfaces adding a space between them.
 * @param tokens the tokens whose surfaces are joined, in list order
 * @return the joined surfaces with leading and trailing whitespace trimmed;
 *         an empty string for an empty list
 */
public static String tokensToSpaceDelimitedStr(List<Token> tokens){
    StringBuilder sb = new StringBuilder();
    for (Token token : tokens) {
        sb.append(token.getSurface());
        sb.append(" ");
    }
    // trim() removes the space appended after the last token.
    return sb.toString().trim();
}
}
WhiteSpaceTokeniser.java 0000664 0000000 0000000 00000003762 14163277713 0034621 0 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import uk.ac.cam.ch.wwmm.oscar.document.Token;
import uk.ac.cam.ch.wwmm.oscar.types.BioTag;
import uk.ac.cam.ch.wwmm.oscar.types.BioType;
/****************************************
 * A whitespace tokeniser to be used as a
 * substitute to the OSCAR tokeniser.
 * Every maximal run of non-whitespace characters becomes one token.
 * @author lh359
 * @author dl387
 *
 *****************************************/
public class WhiteSpaceTokeniser implements ChemicalTaggerTokeniser {

    /** Matches one maximal run of non-whitespace characters. */
    private static final Pattern TOKEN_PATTERN = Pattern.compile("\\S+");

    /*****************************
     * Default constructor method.
     ***************************/
    public WhiteSpaceTokeniser(){
    }

    /********************************************
     * Tokenises a String on white space.
     * Each token records its text, its start and end offsets in the input
     * as reported by the matcher, and a running index starting at 0.
     * @param inputSentence (String) text to split; must not be null
     * @return List of tokens in order of appearance, empty if the input
     *         contains no non-whitespace characters
     *****************************************/
    public List<Token> tokenise(String inputSentence){
        List<Token> tokens = new ArrayList<Token>();
        Matcher m = TOKEN_PATTERN.matcher(inputSentence);
        int tokenIndex = 0;
        while (m.find()) {
            int start = m.start();
            int end = m.end();
            String value = m.group();
            Token t = new Token(value, start, end, null, new BioType(BioTag.O), null);
            t.setIndex(tokenIndex++);
            tokens.add(t);
        }
        return tokens;
    }
}
chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/XMLtoAST.java 0000664 0000000 0000000 00000005420 14163277713 0032324 0 ustar 00root root 0000000 0000000 /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger;
import java.util.ArrayList;
import java.util.List;
import nu.xom.Document;
import nu.xom.Element;
import nu.xom.Text;
import org.antlr.v4.runtime.ParserRuleContext;
/*****************************
* Converts ANTLR ASTTrees into XML Documents.
*
* @author lh359
*****************************/
public class XMLtoAST {
private List SentenceList;
private ParserRuleContext parseTree;
public XMLtoAST() {
SentenceList = new ArrayList();
}
/********************************************
* Converts AST Trees to XML Document.
*
* @param doc (Document)
* @return doc (Document)
*******************************************/
public Document convert(Document doc) {
parseTree = new ParserRuleContext();
SentenceList = new ArrayList();
getNodes(doc.getRootElement(), parseTree);
return doc;
}
public String getStringTree() {
return parseTree.toStringTree();
}
public List getSentenceList() {
return SentenceList;
}
/**********************************************
* A recursive function that goes through the leaves of the tree to create
* XML nodes.
*
* @param docElement (Element)
* @param treeNode (ParserRuleContext)
**********************************************/
public void getNodes(Element docElement, ParserRuleContext treeNode) {
int nodeCount = docElement.getChildCount();
for (int i = 0; i < nodeCount; i++) {
if (docElement.getChild(i) instanceof Text) {
//treeNode.addChild(new ParserRuleContext(docElement.getChild(i).getValue()));
treeNode.addChild(new ParserRuleContext());
}
else {
Element docChild = (Element) docElement.getChild(i);
String name = docChild.getLocalName();
if (docChild.getAttribute("type") != null) {
name = docChild.getLocalName() + "_"
+ docChild.getAttributeValue("type");
}
ParserRuleContext subTree = new ParserRuleContext();
treeNode.addChild(subTree);
getNodes(docChild, subTree);
if (name.startsWith("Sentence")
&& !(docChild.toXML().contains("Unmatched"))) {
SentenceList.add(subTree.toStringTree());
}
}
}
}
}
chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/modelParser/ 0000775 0000000 0000000 00000000000 14163277713 0032362 5 ustar 00root root 0000000 0000000 AnalysePatents.java 0000664 0000000 0000000 00000007170 14163277713 0036106 0 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/modelParser /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger.modelParser;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import nu.xom.Builder;
import nu.xom.Document;
import nu.xom.Element;
import nu.xom.ParsingException;
import nu.xom.ValidityException;
import org.apache.log4j.Logger;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;
import uk.ac.cam.ch.wwmm.chemicaltagger.Utils;
/****************************************************
 * First steps at creating a corpus for the ChemicalTagger model parser.
 * Extracts the paragraphs that follow "EXAMPLE" headings in patent XML
 * files and writes each one out as its own XML document.
 * @author lezan
 *
 ***************************************************/
public class AnalysePatents {

    private static final Logger LOG = Logger.getLogger(AnalysePatents.class);

    /**
     * Collects the paragraphs of the experimental sections of a patent.
     * The children of the first "description" element are scanned in order:
     * a heading whose text starts with "EXAMPLE" opens an experimental
     * section, any other heading closes it, and every "p" element seen while
     * a section is open is copied into its own Document.
     *
     * @param sourceFile location of the patent XML, passed to the XOM Builder
     * @return one Document per collected paragraph; empty if the file has no
     *         "description" element or no experimental paragraphs
     * @throws ValidityException if XOM reports the document as invalid
     * @throws ParsingException if the document is not well-formed
     * @throws IOException if the file cannot be read
     * @throws SAXException if the XML reader cannot be created or configured
     */
    public List<Document> getExperimentalParagraphs(String sourceFile)
            throws ValidityException, ParsingException, IOException,
            SAXException {
        List<Document> docList = new ArrayList<Document>();
        LOG.info("Extracting paragraphs from " + sourceFile);
        XMLReader xmlReader = XMLReaderFactory.createXMLReader();
        // Do not fetch the external DTD that the source files reference.
        xmlReader.setFeature(
                "http://apache.org/xml/features/nonvalidating/load-external-dtd",
                false);
        Document doc = new Builder(xmlReader).build(sourceFile);
        nu.xom.Nodes descriptions = doc.query("//description");
        if (descriptions.size() == 0) {
            // Nothing to scan; report it instead of failing on get(0).
            LOG.warn("No description element found in " + sourceFile);
            return docList;
        }
        Element rootElement = (Element) descriptions.get(0);
        boolean experimentalStart = false;
        for (int i = 0; i < rootElement.getChildCount(); i++) {
            // Only elements are of interest; text, processing instructions,
            // comments and any other node kind are skipped.
            if (!(rootElement.getChild(i) instanceof Element)) {
                continue;
            }
            Element childElement = (Element) rootElement.getChild(i);
            boolean isHeading = childElement.getLocalName().equalsIgnoreCase("heading");
            boolean isExampleHeading = isHeading
                    && childElement.getValue().startsWith("EXAMPLE");
            if (experimentalStart && isHeading && !isExampleHeading) {
                experimentalStart = false;
            } else if (experimentalStart && childElement.getLocalName().equals("p")) {
                LOG.debug(childElement.toXML());
                // Copy the paragraph so it can become the root of a new document.
                docList.add(new Document((Element) childElement.copy()));
            } else if (isExampleHeading) {
                experimentalStart = true;
            }
        }
        return docList;
    }

    /************************************
     * Main class. Processes every entry of the directory given as the first
     * argument and writes the extracted paragraphs to "target/", named after
     * the source file followed by a running index.
     * @param args args[0] is the directory containing the patent files
     * @throws IOException if the directory cannot be listed or a file cannot be read
     * @throws ValidityException
     * @throws ParsingException
     * @throws SAXException
     */
    public static void main(String[] args) throws IOException,
            ValidityException, ParsingException, SAXException {
        if (args.length < 1) {
            throw new IllegalArgumentException("Usage: AnalysePatents <patent directory>");
        }
        String path = args[0];
        File patentDirectory = new File(path);
        String[] patentDir = patentDirectory.list();
        if (patentDir == null) {
            // File.list() returns null when the path is not a listable directory.
            throw new IOException("Not a readable directory: " + path);
        }
        // Add a separator only when the caller did not supply one, so paths
        // that already end in a separator are built exactly as before.
        boolean endsWithSeparator = path.endsWith("/") || path.endsWith(File.separator);
        String prefix = endsWithSeparator ? path : path + "/";
        AnalysePatents extract = new AnalysePatents();
        for (String file : patentDir) {
            String resourcePath = prefix + file;
            List<Document> docList = extract.getExperimentalParagraphs(resourcePath);
            int index = 0;
            for (Document document : docList) {
                Utils.writeXMLToFile(document,
                        "target/" + file.replace(".xml", "") + index + ".xml");
                index += 1;
            }
        }
    }
}
ChemistryModelParser.java 0000664 0000000 0000000 00000003743 14163277713 0037262 0 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/modelParser /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger.modelParser;
import java.io.IOException;
import java.io.InputStream;
import nu.xom.Document;
import opennlp.tools.cmdline.parser.ParserTool;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import opennlp.tools.parser.ParserModel;
import uk.ac.cam.ch.wwmm.chemicaltagger.Utils;
/************************************************
 *
 * Runs the ChemicalParser model against text.
 * The parser model is loaded from the classpath when an instance is created.
 *
 * @author lh359
 ************************************************/
public class ChemistryModelParser {

    /** Classpath location of the serialized parser model. */
    private static final String MODEL_PATH =
            "/uk/ac/cam/ch/wwmm/chemicaltagger/modelParser/chem-parser-chunking.bin";

    private final Parser parser;

    /**
     * Loads the parser model and creates the parser.
     *
     * @throws NullPointerException if the model cannot be found or read; the
     *         underlying IOException is attached as the cause. The exception
     *         type is kept from earlier versions of this class.
     */
    public ChemistryModelParser() {
        InputStream modelIn = null;
        ParserModel model;
        try {
            modelIn = Utils.getInputStream(getClass(), MODEL_PATH);
            model = new ParserModel(modelIn);
        } catch (IOException e) {
            NullPointerException failure =
                    new NullPointerException("Could not load parser model " + MODEL_PATH);
            failure.initCause(e);
            throw failure;
        } finally {
            if (modelIn != null) {
                try {
                    modelIn.close();
                } catch (IOException ignored) {
                    // Closing is best effort; a failure here must not hide
                    // the outcome of loading the model.
                }
            }
        }
        parser = ParserFactory.create(model);
    }

    /**
     * Parses a sentence, requesting a single parse, and writes each returned
     * parse as XML to "target/testParse.xml".
     *
     * @param sentence the text to parse
     */
    public void parseSentence(String sentence) {
        Parse[] topParses = ParserTool.parseLine(sentence, parser, 1);
        ParseTreetoXML parseToXML = new ParseTreetoXML();
        for (Parse parse : topParses) {
            Document doc = parseToXML.convert(parse);
            Utils.writeXMLToFile(doc,"target/testParse.xml");
        }
    }
}
CreateTreeBank.java 0000664 0000000 0000000 00000006701 14163277713 0035771 0 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/modelParser /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger.modelParser;
import java.io.File;
import java.io.IOException;
import nu.xom.Builder;
import nu.xom.Document;
import nu.xom.Node;
import nu.xom.Nodes;
import nu.xom.ParsingException;
import nu.xom.Text;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import uk.ac.cam.ch.wwmm.chemicaltagger.Utils;
import uk.ac.cam.ch.wwmm.chemicaltagger.XMLtoAST;
/********************************************************************
 * Creates a Treebank from the rule-based ChemicalTagger output
 * @author lh359
 ********************************************************************/
public class CreateTreeBank {

    private static final Logger LOG = Logger.getLogger(CreateTreeBank.class);

    /************************************************
     * Extracts the content from the patent files.
     * The trimmed text children of every "p" element are appended to the
     * result, each preceded by a single space. Within a paragraph, the first
     * text child that starts with "tlc" or "mass sp" (ignoring case) ends the
     * collection for that paragraph.
     * The text of "spectrum" elements is not part of the result; earlier code
     * gathered it into a local variable that was never used.
     * @param sourceFile location of the XML file, passed to the XOM Builder
     * @return the collected text; an empty string if the file cannot be read
     *         or parsed (the failure is logged)
     */
    public String getContent(String sourceFile) {
        Builder builder = new Builder();
        LOG.info("Extracting data from "+sourceFile);
        StringBuilder content = new StringBuilder();
        try {
            Document doc = builder.build(sourceFile);
            Nodes sections = doc.query("//p");
            for (int i = 0; i < sections.size(); i++) {
                Node node = sections.get(i);
                for (int j = 0; j < node.getChildCount(); j++) {
                    if (node.getChild(j) instanceof Text) {
                        String cleanNode = node.getChild(j).getValue().trim();
                        String lowered = cleanNode.toLowerCase();
                        if (lowered.startsWith("tlc") || lowered.startsWith("mass sp")) {
                            // Leave the rest of this paragraph out.
                            break;
                        }
                        content.append(" ").append(cleanNode);
                    }
                }
            }
        } catch (ParsingException ex) {
            // Log the real exception so its stack trace is kept.
            LOG.fatal("ParsingException " + ex.getMessage(), ex);
        } catch (IOException ex) {
            LOG.fatal(ex.getMessage(), ex);
        }
        return content.toString();
    }

    /******************************************************
     * Main class. Runs ChemicalTagger over the content of every entry of
     * the directory given as the first argument and writes the sentence
     * trees to "target/", using the source file name with a trailing
     * "xml" replaced by "txt".
     * @param args args[0] is the directory containing the patent files
     * @throws IOException if the directory cannot be listed or output cannot be written
     ****************************************************/
    public static void main (String[] args) throws IOException {
        if (args.length < 1) {
            throw new IllegalArgumentException("Usage: CreateTreeBank <patent directory>");
        }
        String path = args[0];
        File patentDirectory = new File(path);
        String[] patentDir = patentDirectory.list();
        if (patentDir == null) {
            // File.list() returns null when the path is not a listable directory.
            throw new IOException("Not a readable directory: " + path);
        }
        // Add a separator only when the caller did not supply one, so paths
        // that already end in a separator are built exactly as before.
        boolean endsWithSeparator = path.endsWith("/") || path.endsWith(File.separator);
        String prefix = endsWithSeparator ? path : path + "/";
        XMLtoAST xmlAst = new XMLtoAST();
        CreateTreeBank extract = new CreateTreeBank();
        for (String file : patentDir) {
            String resourcePath = prefix + file;
            String content = extract.getContent(resourcePath);
            Document doc = Utils.runChemicalTagger(content);
            xmlAst.convert(doc);
            // Swap only the trailing extension; "xml" elsewhere in the name is kept.
            String outputName = file.endsWith("xml")
                    ? file.substring(0, file.length() - "xml".length()) + "txt"
                    : file;
            Utils.writeListToFile(xmlAst.getSentenceList(),"target/"+outputName);
        }
    }
}
ParseTreetoXML.java 0000664 0000000 0000000 00000004640 14163277713 0035770 0 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/modelParser /**
* Copyright 2012 Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.cam.ch.wwmm.chemicaltagger.modelParser;
import nu.xom.Document;
import nu.xom.Element;
import opennlp.tools.parser.Parse;
import org.apache.commons.lang.StringUtils;
/*****************************
 * Turns OpenNLP Parse trees into XOM XML Documents.
 *
 * @author lh359
 *****************************/
public class ParseTreetoXML {

    /********************************************
     * Default constructor method.
     *******************************************/
    public ParseTreetoXML() {
    }

    /********************************************
     * Builds an XML Document from a Parse tree.
     * The root element is always "Document". When the parse has children
     * and a non-empty text, they are placed inside a "Sentence" element
     * under the root; when it has children but an empty text, they are
     * placed directly under the root.
     *
     * @param parseTree
     *            (Tree)
     * @return the resulting Document
     *******************************************/
    public Document convert(Parse parseTree) {
        Element documentRoot = new Element("Document");
        boolean hasChildren = parseTree.getChildCount() > 0;
        if (hasChildren) {
            if (StringUtils.isNotEmpty(parseTree.getText())) {
                Element sentence = new Element("Sentence");
                getNodes(parseTree, sentence);
                documentRoot.appendChild(sentence);
            } else {
                getNodes(parseTree, documentRoot);
            }
        }
        return new Document(documentRoot);
    }

    /**********************************************
     * A recursive function that walks the children of a parse node and
     * appends them to an XML element: a child without children of its own
     * is appended as text, any other child becomes an element named after
     * its type and is filled in recursively.
     *
     * @param parseTree (Tree)
     * @param node (Element)
     * @return node (Element) the element that was passed in
     **********************************************/
    public Element getNodes(Parse parseTree, Element node) {
        for (Parse child : parseTree.getChildren()) {
            boolean isLeaf = child.getChildCount() == 0;
            if (!isLeaf) {
                Element childElement = new Element(child.getType());
                node.appendChild(childElement);
                getNodes(child, childElement);
                continue;
            }
            node.appendChild(child.toString());
        }
        return node;
    }
}
chemicaltagger-chemicalTagger-1.6.2/src/main/resources/ 0000775 0000000 0000000 00000000000 14163277713 0023114 5 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/resources/antlr/ 0000775 0000000 0000000 00000000000 14163277713 0024234 5 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/resources/antlr/chemicalInput.txt 0000664 0000000 0000000 00000002610 14163277713 0027561 0 ustar 00root root 0000000 0000000 NN-SYNTHESIZE Synthesis IN-OF of OSCAR-CD 3 STOP . DT A NN-MIXTURE mixture IN-OF of OSCAR-CD 2 -LRB- ( CD 1.1 NN-GRAM g COMMA , CD 3.6 NN-MOL mmol -RRB- ) COMMA , OSCAR-CM zinc OSCAR-CM oxide -LRB- ( CD 0.160 NN-GRAM g COMMA , CD 0.196 NN-MOL mmol -RRB- ) CC and OSCAR-CM acetic OSCAR-CM anhydride -LRB- ( CD 0.37 NN-GRAM g COMMA , CD 3.6 NN-MOL mmol -RRB- ) IN-IN in OSCAR-CM acetic OSCAR-CM acid -LRB- ( CD 4.3 NN-GRAM g COMMA , CD 72 NN-MOL mmol -RRB- ) VBD was VB-STIR stirred IN at CD 30 NN-TEMP °C IN-FOR for CD 2 NN-TIME d STOP . DT The NN-CHEMENTITY reaction VBD was VB-QUENCH quenched IN-WITH with OSCAR-CJ saturated OSCAR-CM NaHCO3 COMMA , CC and DT the NN-CHEMENTITY mixture VBD was VB-EXTRACT extracted IN-WITH with OSCAR-CM Et2O STOP . DT The JJ combined OSCAR-CJ organic NN-CHEMENTITY layer VBD was VB-WASH washed IN-WITH with OSCAR-CM H2O CC and VB-DRY dried IN-OVER over OSCAR-CJ anhydrous OSCAR-CM Na2SO4 STOP . DT The NN-CHEMENTITY solvent VBD was VB-REMOVE removed IN-IN in NN-VACUUM vacuo COMMA , CC and DT the NN-CHEMENTITY residue VBD was VB-PURIFY purified IN-BY by NN-FLASH flash NN-COLUMN column NN-CHROMATOGRAPHY chromatography IN-ON on OSCAR-CM silica NN-CHEMENTITY gel -LRB- ( OSCAR-CM hexane DASH / OSCAR-CM Et2O SYM = CD 2/1 -RRB- ) TO to VB-YIELD give CD 0.97 NN-GRAM g IN-OF of CD 3 IN-AS as DT a JJ pale JJ yellow JJ viscous NN-STATE liquid -LRB- ( CD 95 NN-PERCENT % -RRB- ) STOP . 
chemicaltagger-chemicalTagger-1.6.2/src/main/resources/header.txt 0000664 0000000 0000000 00000001140 14163277713 0025101 0 ustar 00root root 0000000 0000000 Copyright ${year} Lezan Hawizy, David M. Jessop, Daniel Lowe and Peter Murray-Rust
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
chemicaltagger-chemicalTagger-1.6.2/src/main/resources/uk/ 0000775 0000000 0000000 00000000000 14163277713 0023533 5 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/resources/uk/ac/ 0000775 0000000 0000000 00000000000 14163277713 0024116 5 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/resources/uk/ac/cam/ 0000775 0000000 0000000 00000000000 14163277713 0024656 5 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/resources/uk/ac/cam/ch/ 0000775 0000000 0000000 00000000000 14163277713 0025250 5 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/resources/uk/ac/cam/ch/wwmm/ 0000775 0000000 0000000 00000000000 14163277713 0026237 5 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/resources/uk/ac/cam/ch/wwmm/chemicaltagger/ 0000775 0000000 0000000 00000000000 14163277713 0031176 5 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/resources/uk/ac/cam/ch/wwmm/chemicaltagger/modelParser/0000775 0000000 0000000 00000000000 14163277713 0033453 5 ustar 00root root 0000000 0000000 chem-parser-chunking.bin 0000664 0000000 0000000 00000433671 14163277713 0040116 0 ustar 00root root 0000000 0000000 chemicaltagger-chemicalTagger-1.6.2/src/main/resources/uk/ac/cam/ch/wwmm/chemicaltagger/modelParser PK s> manifest.propertiesUAO0$;vI/#FȼWhBJ1Kl^}gGv%L8=$Wa{Jz4IyC>F4\ZƴPWeoZߖS q.QJfrWݸ8Ʃ*
N"4Γ2
F4;ۀRp$m;Ħ@i,YƟMt^N#PK < PK s> parsertager.postagger 9@ƿPK s> manifest.propertiesMͻ0@^_!H0زP!EB^)*<6߿6٨MXXhSuA[0F]HB=:8KnԳܟRzTa,YRu*