/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.stanbol.commons.opennlp; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Set; import java.util.TreeSet; import opennlp.tools.util.Span; /** * Simple version of a {@link opennlp.tools.chunker.Chunker} that uses the POS tags to build chunks. * It does not implement the {@link opennlp.tools.chunker.Chunker} interface because implementing * methods other than the {@link opennlp.tools.chunker.Chunker#chunkAsSpans(String[], String[])} * is not feasible.
* Defaults are based on the * Penn Treebank tag set * * TODO:
null
if no configuration for the
* parsed language is present in the {@link PosTagsCollectionEnum}.
*/
public static PosTypeChunker getInstance(String lang,double minPosTagProbaility){
Set
* Note that the buildPosTypes are always added to the followed ones. Therefore the
* followPosTypes may or may not include some/all buildPosTypes.
* @param buildPosTypes the POS types that trigger a new Chunk (MUST NOT be
*
* This method is the equivalent of
* {@link opennlp.tools.chunker.Chunker#chunkAsSpans(String[], String[])}
* @param tokens the tokens
* @param tags the POS tags for the tokens
* @return the chunks as spans over the parsed tokens
*/
public Span[] chunkAsSpans(String[] tokens, String[] tags) {
int consumed = -1;
List chunks = new ArrayList();
for(int i=0;i
* @param tokens the tokens
* @param tags the POS tags for the tokens (1D:tokens; 2D:POS tags)
* @return the chunks as spans over the parsed tokens
*/
public Span[] chunkAsSpans(String[] tokens, String[][] tags,double[][]props) {
//NOTE: this is a 1:1 copy of the above method! However, this is the
// only solution, because merging them into a single one would
// need to copy the String[] of the other into a String[][1] as
// used by this one :(
// If someone has a better idea, feel free to change it!
// Rupert Westenthaler (28.Sep.2011)
int consumed = -1;
List chunks = new ArrayList();
for(int i=0;inull
nor {@link Set#isEmpty() empty}).
* @param followPosTypes additional POS types followed to extend Chunks (MAY
* BE null
or empty).
*/
public PosTypeChunker(Setnull
if
* not available
* @param pos the POS tags
* @return true
if follow
*/
private boolean followPOS(double[] props,String... pos){
boolean reject = false;
for(int i=0;i