/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.stanbol.commons.opennlp; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Chunk; /** * Enumeration with pre-configured sets of POS tags for finding nouns, verbs ... * in different languages * @author Rupert Westenthaler * @deprecated replaced by STANBOL-733 (stanbol nlp processing module */ public enum PosTagsCollectionEnum { /** * Nouns related POS types for English based on the * * Penn Treebank tag set. *
* NOTE the "``" tag is also added as noun, because it can not be found in * the official tag set and is sometimes used to tag nouns. */ EN_NOUN("en",PosTypeCollectionType.NOUN,"NN","NNP","NNPS","NNS","FW","CD","``"), /** * Verb related POS types for English based on the * * Penn Treebank tag set */ EN_VERB("en",PosTypeCollectionType.VERB,"VB","VBD","VBG","VBN","VBP","VBZ"), /** * POS types one needs typically to follow to build {@link Chunk}s over * Nouns (e.g. "University_NN of_IN Otago_NNP" or "Geneva_NNP ,_, Ohio_NNP"). * For English and based on the * * Penn Treebank tag set */ EN_FOLLOW("en",PosTypeCollectionType.FOLLOW,"#","$"," ","(",")",",",".",":","POS","IN","JJ"), /** * Noun related POS types for German based on the * * STTS Tag Set */ DE_NOUN("de",PosTypeCollectionType.NOUN,"NN","NE","FM","XY"), /** * Verb related POS types for German based on the * * STTS Tag Set */ DE_VERB("de",PosTypeCollectionType.VERB,"VVFIN","VVIMP","VVINF","VVIZU","VVPP","VAFIN","VAIMP","VAINF", "VAPP","VMFIN","VMINF","VMPP"), /** * POS types one needs typically to follow to build {@link Chunk}s over * Nouns (e.g. "University_NN of_IN Otago_NNP" or "Geneva_NNP ,_, Ohio_NNP"). * For German based on the * * STTS Tag Set */ DE_FOLLOW("de",PosTypeCollectionType.FOLLOW,"$.","$,","$(","APPR"), /** * POS types representing Nouns for Danish based on the PAROLE Tagset as * described by this paper *
* TODO: Someone who speaks Danish should check this List * NOTES:
* TODO: Someone who speaks Danish should check this List */ DA_VERB("da",PosTypeCollectionType.VERB,"V","VA","VE"), /** * POS types that are followed to extend chunks for Danish based on the PAROLE Tagset as * described by this paper *
* TODO: Someone who speaks Danish should check this List
* NOTES:
* TODO: Someone who speaks this language should check this List
* NOTES: Currently this includes nouns, proper nouns and numbers. * In addition I added "vp". "vp" is not part of the POS tag set * documentation, but the training set contains a single occurrence; * therefore the POS tagger sometimes tags words with this tag. */ PT_NOUN("pt",PosTypeCollectionType.NOUN,"n","num","prop","vp"), /** * POS types for Verbs based on the * PALAVRAS tag set * for Portuguese.
* TODO: Someone who speaks this language should check this List
*/ PT_VERB("pt",PosTypeCollectionType.VERB,"v-pcp","v-fin","v-inf","v-ger"), /** * POS types followed to build Chunks based on the * PALAVRAS tag set * for Portuguese.
* TODO: Someone who speaks this language should check this List
* NOTES: Currently this includes punctuation and prepositions. */ PT_FOLLOW("pt",PosTypeCollectionType.FOLLOW,"punc", "prp"), /** * POS types for Nouns based on the WOTAN tagset for Dutch (as used with * Mbt).
* TODO: Someone who speaks this language should check this List
* NOTES: This includes now Nouns, Numbers and "others". */ NL_NOUN("nl",PosTypeCollectionType.NOUN,"N","Num","Misc"), /** * POS types for Verbs based on the WOTAN tagset for Dutch (as used with * Mbt).
* The tagger does not distinguish the different forms of verbs. Therefore * it is enough to include "V" */ NL_VERB("nl",PosTypeCollectionType.VERB,"V"), /** * POS types followed to build Chunks based on the WOTAN tagset for Dutch * (as used with Mbt).
* NOTES: This includes only prepositions and punctuation * */ NL_FOLLOW("nl",PosTypeCollectionType.FOLLOW,"Punc","Prep"), /** * POS types for Nouns for Swedish language based on * * Lexical categories in MAMBA * NOTE:
* NOTES: this includes prepositions, Part of idiom, Infinitive marker * as well as all kinds of punctuations */ SV_FOLLOW("sv",PosTypeCollectionType.FOLLOW,"PR","ID","IM","I?","IC","IG","IK","IP","IQ","IR","IS","IT","IU"), /** * Nouns related POS types for Spanish language. * The description of the Tagset is available at * http://www.lsi.upc.edu/~nlp/SVMTool/parole.html */ ES_NOUN("es",PosTypeCollectionType.NOUN,"NC","NP","Z"), /** * Verb related POS types for Spanish language. * I was not able to find the list, so POS tag results where used to * create this configuration */ ES_VERB("es",PosTypeCollectionType.VERB,"VMI", "VMS", "VMM", "VMC", "VMN", "VMG", "VMP", "VAI", "VAS","VAM", "VAC", "VAN", "VAG", "VAP"), /** * POS types one needs typically to follow to build {@link Chunk}s over * Nouns (e.g. "University_NN of_IN Otago_NNP" or "Geneva_NNP ,_, Ohio_NNP"). * I was not able to find the list, so POS tag results where used to * create this configuration.
* For now this includes "AQ", "SP" and all "F*" tags referring to '.', ';', ...
*/
ES_FOLLOW("es",PosTypeCollectionType.FOLLOW,"AQ","SP","Fc","Ft","Fp","Fe","Fd","Fx","Fat","Fit","Fpa","Fpt","Fg","Faa","Ft");
Setnull
if no configuration for the
* parsed parameters is available.
*/
public static Set