/* * Copyright: (c) 2009 Mayo Foundation for Medical Education and * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the * triple-shield Mayo logo are trademarks and service marks of MFMER. * * Except as contained in the copyright notice above, or as used to identify * MFMER as the author of this software, the trade names, trademarks, service * marks, or product names of the copyright holder shall not be used in * advertising, promotion or otherwise in connection with this software without * prior written authorization of the copyright holder. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/
package data.pos.training;

import static org.junit.Assert.assertEquals;

import org.jdom.JDOMException;
import org.junit.Test;

import data.pos.training.GeniaPosTrainingDataExtractor.TaggedAbstract;
import data.pos.training.GeniaPosTrainingDataExtractor.TaggedSentence;
import data.pos.training.GeniaPosTrainingDataExtractor.TaggedWord;

/**
 * Checks the word/tag pairs that {@link GeniaPosTrainingDataExtractor} yields
 * for the fixture file {@code test/data/GENIAcorpus3.02.pos.test.xml}.
 */
public class GeniaPosTrainingDataExtractorTests {

    /**
     * Reads the first two abstracts from the fixture and spot-checks selected
     * sentences: individual word/tag pairs and, for one sentence, the word count.
     *
     * @throws JDOMException declared because the extractor API used here may throw it
     */
    @Test
    public void test() throws JDOMException {
        GeniaPosTrainingDataExtractor extractor =
                new GeniaPosTrainingDataExtractor("test/data/GENIAcorpus3.02.pos.test.xml");

        TaggedAbstract currentAbstract = extractor.next();

        // Sentence 0 (the title, per the original author's note) is verified
        // word by word; each row is {expected word, expected tag}.
        String[][] expectedTitle = {
            {"Pancreatic", "JJ"},
            {"development", "NN"},
            {"and", "CC"},
            {"maturation", "NN"},
            {"of", "IN"},
            {"the", "DT"},
            {"islet", "NN"},
            {"B", "NN"},
            {"cell", "NN"},
            {".", "."},
        };
        TaggedSentence sentence = currentAbstract.getTaggedSentences().get(0);
        for (int position = 0; position < expectedTitle.length; position++) {
            assertTaggedWord(sentence, position, expectedTitle[position][0], expectedTitle[position][1]);
        }

        // Sentence 5 of the same abstract; only its first word is checked.
        // Text of the sentence as recorded by the original author:
        // The three compartments are thought to be of common endodermal origin; in contrast to earlier
        // hypotheses, which suggested that the endocrine compartment was of neuroectodermal origin.
        sentence = currentAbstract.getTaggedSentences().get(5);
        assertTaggedWord(sentence, 0, "The", "DT");

        // Sentence 1: only the number of tagged words is checked.
        sentence = currentAbstract.getTaggedSentences().get(1);
        assertEquals(6, sentence.getTaggedWords().size());

        // Sentence 2: first word, one word in the middle, and the word at index 17.
        sentence = currentAbstract.getTaggedSentences().get(2);
        assertTaggedWord(sentence, 0, "Pancreas", "NN");
        assertTaggedWord(sentence, 11, "anlage", "NNS");
        assertTaggedWord(sentence, 17, ".", ".");

        // Advance to the next abstract and spot-check its sentence 4.
        currentAbstract = extractor.next();
        sentence = currentAbstract.getTaggedSentences().get(4);
        assertTaggedWord(sentence, 0, "We", "PRP");
        assertTaggedWord(sentence, 37, "non-octamer", "JJ");
    }

    /**
     * Asserts that the word at {@code index} in {@code sentence} has the given
     * text and tag; the word is compared first, then the tag.
     *
     * @param sentence     sentence whose tagged words are inspected
     * @param index        zero-based position of the word within the sentence
     * @param expectedWord expected value of {@code getWord()}
     * @param expectedTag  expected value of {@code getTag()}
     */
    private static void assertTaggedWord(TaggedSentence sentence, int index,
            String expectedWord, String expectedTag) {
        TaggedWord actual = sentence.getTaggedWords().get(index);
        assertEquals(expectedWord, actual.getWord());
        assertEquals(expectedTag, actual.getTag());
    }
}