/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.stanbol.commons.opennlp; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Set; import java.util.TreeSet; import opennlp.tools.util.Span; /** * Simple version of a {@link opennlp.tools.chunker.Chunker} that uses the POS tags to build chunks. * It does not implement the {@link opennlp.tools.chunker.Chunker} interface because implementing * methods other than the {@link opennlp.tools.chunker.Chunker#chunkAsSpans(String[], String[])} * is not feasible.

* <p>
* Defaults are based on the Penn Treebank tag set.
* <p>
* TODO:
* <ul>
* <li>Test if POS tags are the same for different languages</li>
* <li>Check if it is possible to implement the {@link opennlp.tools.chunker.Chunker} interface</li>
* </ul>

* @author Rupert Westenthaler
* @deprecated replaced by STANBOL-733 (stanbol nlp processing module)
*/
public class PosTypeChunker {

    /**
     * Minimum probability a POS tag needs so that it is evaluated. Tags with
     * a lower probability are skipped by {@link #followPOS(double[], String...)}
     * and {@link #includePOS(double[], String...)}.
     */
    private final double minPosProb;
    /** POS types used to extend chunks (always contains all {@link #buildTypes}). */
    private final Set<String> followTypes;
    /** POS types that trigger the creation of a chunk. */
    private final Set<String> buildTypes;

    /**
     * Creates an instance for the given language based on the configuration
     * within the {@link PosTagsCollectionEnum}.
     * @param lang The language
     * @param minPosTagProbaility The minimum probability of a POS tag so that
     * it is processed. In case of lower Probabilities POS tags are ignored and
     * assumed to be matching.
     * @return the instance or {@code null} if no configuration for the
     * parsed language is present in the {@link PosTagsCollectionEnum}.
     */
    public static PosTypeChunker getInstance(String lang,double minPosTagProbaility){
        Set<String> nounPosTagCollection =
                PosTagsCollectionEnum.getPosTagCollection(lang, PosTypeCollectionType.NOUN);
        if(nounPosTagCollection != null && !nounPosTagCollection.isEmpty()){
            return new PosTypeChunker(nounPosTagCollection,
                PosTagsCollectionEnum.getPosTagCollection(
                    lang,PosTypeCollectionType.FOLLOW),minPosTagProbaility);
        } else {
            return null;
        }
    }

    /**
     * Initialise a new PosTypeChunker for the parsed POS tag collections. This
     * Constructor can be used if no predefined Configuration for a given
     * language is available in the {@link PosTagsCollectionEnum}
     * <p>
     * Note that buildPosTypes are added to the followed once. Therefore the
     * followPosTypes may or may not include some/all buildPosTypes.
     * @param buildPosTypes the POS types that trigger a new Chunk (MUST NOT be
     * {@code null} nor {@link Set#isEmpty() empty}).
     * @param followPosTypes additional POS types followed to extend Chunks (MAY
     * BE {@code null} or empty).
     * @param minPosProb the minimum probability of a POS tag so that it is
     * processed. Expected to be in the range [0..1]; values &lt; 0 deactivate
     * this feature.
     * @throws IllegalArgumentException if buildPosTypes is {@code null} or
     * empty, or if minPosProb is greater than 1
     */
    public PosTypeChunker(Set<String> buildPosTypes,Set<String> followPosTypes,double minPosProb){
        if(buildPosTypes == null || buildPosTypes.isEmpty()){
            throw new IllegalArgumentException("The set of POS types used to "
                + "build Chunks MUST NOT be NULL nor empty!");
        }
        //defensive, read-only copies so later changes to the parsed sets
        //do not affect this instance
        this.buildTypes = Collections.unmodifiableSet(new TreeSet<String>(buildPosTypes));
        Set<String> follow = new TreeSet<String>();
        follow.addAll(buildTypes);
        if(followPosTypes != null){
            follow.addAll(followPosTypes);
        }
        this.followTypes = Collections.unmodifiableSet(follow);
        if(minPosProb > 1){
            throw new IllegalArgumentException("The minimum POS tag probalility MUST BE set to a value [0..1] or values < 0 to deactivate this feature (parsed="+minPosProb+")!");
        } else {
            this.minPosProb = minPosProb;
        }
    }

    /**
     * Checks if a token with the parsed POS tag alternatives is followed to
     * extend a chunk.
     * @param props the probabilities of the pos tags or {@code null} if
     * not available. NOTE(review): if present this is indexed in parallel to
     * pos, so it is assumed to have at least as many elements - confirm
     * against callers.
     * @param pos the POS tags
     * @return {@code true} if at least one of the evaluated POS tags is
     * contained in the followed types, or if no POS tag was evaluated at all
     * (all probabilities lower than the minimum). {@code false} if POS tags
     * were evaluated but none of them is a followed type.
     */
    private boolean followPOS(double[] props,String... pos){
        boolean reject = false;
        for(int i=0;i<pos.length;i++){
            if(props == null || props[i] >= minPosProb){
                if(followTypes.contains(pos[i])){
                    return true;
                } else {
                    reject = true;
                }
            } //else prob to low ... do not process
        }
        //in case we have not found a POS tag with a prob > minPosProb
        //return TRUE
        return !reject;
    }

    /**
     * Checks if a token with the parsed POS tag alternatives triggers /
     * is included in a chunk. Same evaluation as
     * {@link #followPOS(double[], String...)} but against the build types.
     * @param props the probabilities of the pos tags or {@code null} if
     * not available
     * @param pos the POS tags
     * @return {@code true} if at least one of the evaluated POS tags is
     * contained in the build types, or if no POS tag was evaluated at all.
     * {@code false} if POS tags were evaluated but none is a build type.
     */
    private boolean includePOS(double[] props,String... pos){
        boolean reject = false;
        for(int i=0;i<pos.length;i++){
            if(props == null || props[i] >= minPosProb){
                if(buildTypes.contains(pos[i])){
                    return true;
                } else {
                    reject = true;
                }
            } //else prob to low ... do not process
        }
        //in case we have not found a POS tag with a prob > minPosProb
        //return TRUE
        return !reject;
    }

    /**
     * The set of POS types followed to extend Chunks. This includes the
     * {@link #getChunkPosTypes()} values
     * @return the followTypes (unmodifiable)
     */
    public final Set<String> getFollowedPosTypes() {
        return followTypes;
    }

    /**
     * The set of POS types used to create Chunks
     * @return the buildTypes (unmodifiable)
     */
    public final Set<String> getChunkPosTypes() {
        return buildTypes;
    }

    /**
     * Build the chunks based on the parsed tokens and POS tags.
     * <p>
     * This method is the equivalent to
     * {@link opennlp.tools.chunker.Chunker#chunkAsSpans(String[], String[])}
     * <p>
     * The start and end of the returned spans are the indexes of the first
     * and last token of the chunk. NOTE(review): the end index used here is
     * the index of the last token itself - confirm that callers expect this.
     * @param tokens the tokens
     * @param tags the POS tags for the tokens
     * @return the chunks as spans over the parsed tokens
     */
    public Span[] chunkAsSpans(String[] tokens, String[] tags) {
        //NOTE(review): 'consumed' is never advanced (the assignment after
        //adding a chunk appears to be commented out in the original), so the
        //backward search is only limited by the start of the token array and
        //by the first token that is not followed.
        int consumed = -1;
        List<Span> chunks = new ArrayList<Span>();
        for(int i=0;i<tokens.length;i++){
            if(includePOS(null,tags[i])){
                int start = i;
                while(start-1 > consumed && followPOS(null,tags[start-1])){
                    start--; //follow backwards until consumed
                }
                int followEnd = i;
                int end = i;
                while(followEnd+1 < tokens.length && followPOS(null,tags[followEnd+1])){
                    followEnd++; //follow
                    if(includePOS(null,tags[followEnd])){
                        end = followEnd; //extend end only if act is include
                    }
                }
                chunks.add(new Span(start,end));
                //continue after the last followed token so that tokens of
                //this chunk do not trigger additional chunks
                i = followEnd;
            }//build no chunk for this token
        }
        return chunks.toArray(new Span[chunks.size()]);
    }

    /**
     * Build the chunks based on the parsed tokens and the one or more detected
     * POS tags alternatives for the tokens.
     * <p>
     * The start and end of the returned spans are the indexes of the first
     * and last token of the chunk.
     * @param tokens the tokens
     * @param tags the POS tags for the tokens (1D:tokens; 2D:POS tags)
     * @param props the probabilities of the POS tags, indexed like tags
     * (1D:tokens; 2D:POS tags). The per-token arrays are passed on as is, so
     * a {@code null} entry means that all POS tags of that token are evaluated.
     * @return the chunks as spans over the parsed tokens
     */
    public Span[] chunkAsSpans(String[] tokens, String[][] tags,double[][]props) {
        //NOTE: this is a 1:1 copy of the above method!! However this is the
        //      only solution, because merging them into a single one would
        //      need to copy the String[] of the other into a String[][1] as
        //      used by this one :(
        //      If someone has a better Idea feel free to change!
        //      Rupert Westenthaler (28.Sep.2011)
        int consumed = -1; //never advanced; see the single tag variant
        List<Span> chunks = new ArrayList<Span>();
        for(int i=0;i<tokens.length;i++){
            if(includePOS(props[i],tags[i])){
                int start = i;
                while(start-1 > consumed && followPOS(props[start-1],tags[start-1])){
                    start--; //follow backwards until consumed
                }
                int followEnd = i;
                int end = i;
                while(followEnd+1 < tokens.length && followPOS(props[followEnd+1],tags[followEnd+1])){
                    followEnd++; //follow
                    if(includePOS(props[followEnd],tags[followEnd])){
                        end = followEnd; //extend end only if act is include
                    }
                }
                chunks.add(new Span(start,end));
                //continue after the last followed token so that tokens of
                //this chunk do not trigger additional chunks
                i = followEnd;
            }//build no chunk for this token
        }
        return chunks.toArray(new Span[chunks.size()]);
    }
}