/*
* Copyright: (c) 2009 Mayo Foundation for Medical Education and
* Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
* triple-shield Mayo logo are trademarks and service marks of MFMER.
*
* Except as contained in the copyright notice above, or as used to identify
* MFMER as the author of this software, the trade names, trademarks, service
* marks, or product names of the copyright holder shall not be used in
* advertising, promotion or otherwise in connection with this software without
* prior written authorization of the copyright holder.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package data.pos.dictionary;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Collections;
import java.util.HashMap;
import java.util.ArrayList;
/**
* From a POS corpus in OpenNLP format, creates a list of the POS tags found within the corpus.
* <p>
* Outputs the list of tags to stdout and, for each tag, outputs one word/token that
* had been tagged with that tag.
* @author Mayo Clinic
*/
public class ListTags {

    /**
     * Writes the list of tags to stdout in sorted order, each together with an
     * example: a word that was found tagged with that tag.
     *
     * @param f       the corpus file the tags were read from; only its name is printed
     * @param tagList map from tag to one example token that carried the tag
     * @throws IOException not thrown by the current body; kept so the signature is unchanged
     */
    private static void writeTagList(File f, HashMap<String, String> tagList) throws IOException {
        // sort the tags before outputting them
        ArrayList<String> tags = new ArrayList<String>(tagList.keySet());
        Collections.sort(tags);

        // output to stdout
        System.out.println("\nFor file " + f.getName() + ":");
        for (String tag : tags) {
            System.out.println(tag + "\t which was a tag for '" + tagList.get(tag) + "'");
        }
    }

    /**
     * Reads every line from the reader, splits it on single spaces and records each
     * tag (the text after the last underscore of a token) together with the first
     * token text that was seen carrying it.
     * <p>
     * A HashMap is used so that an example of what was tagged can be kept per tag.
     * Empty tokens (blank lines, consecutive spaces) are skipped silently; a non-empty
     * token without an underscore is reported on stderr and skipped.
     *
     * @param br reader positioned at the start of the corpus; not closed by this method
     * @return map from tag to the first token text found with that tag
     * @throws IOException if reading a line fails
     */
    private static HashMap<String, String> createTagList(BufferedReader br) throws IOException {
        HashMap<String, String> tagList = new HashMap<String, String>(100); // initial size is arbitrary
        String line;
        while ((line = br.readLine()) != null) {
            for (String token : line.split(" ")) {
                if (token.length() == 0) {
                    continue; // blank line or consecutive spaces: nothing to report
                }
                int pos = token.lastIndexOf('_'); // position of last underscore
                if (pos < 0) {
                    System.err.println("ERROR: didn't find underscore within '" + token + "'");
                    continue; // substring(0, -1) below would throw; skip the malformed token
                }
                String taggedThing = token.substring(0, pos);
                String tag = token.substring(pos + 1);
                // keep only the first example seen for each tag
                if (!tagList.containsKey(tag)) {
                    tagList.put(tag, taggedThing);
                }
            }
        }
        return tagList;
    }

    /**
     * Opens the named file for buffered reading.
     *
     * @param filename path of the file to open
     * @return a buffered reader over the file; the caller is responsible for closing it
     * @throws FileNotFoundException if the file cannot be opened; a message naming the
     *         file is written to stderr before the exception is rethrown
     */
    private static BufferedReader getBufferedReader(String filename) throws FileNotFoundException {
        File f = new File(filename);
        Reader r;
        try {
            r = new FileReader(f);
        } catch (FileNotFoundException e) {
            System.err.println("Error reading from file " + filename);
            throw e;
        }
        return new BufferedReader(r);
    }

    /**
     * Reads a file containing POS-tagged tokens in OpenNLP format (or every regular
     * file directly inside a directory) and outputs to stdout the list of tags found.
     * <p>
     * Example input:
     * <pre>
     * body_NN winning_VBG body_NN
     * </pre>
     * Example output (tags only):
     * <pre>
     * NN
     * VBG
     * </pre>
     *
     * @param args {@code args[0]} is required - the name of the input file containing
     *             POS-tagged tokens in OpenNLP format, or a directory of such files.
     *             E.g. {@code data/pos/ptb-pos-training.txt}.
     *             {@code -h} or {@code --help} prints usage instead.
     */
    public static void main(String[] args) {
        // check the array length first: args[0] on an empty array would throw
        if (args == null || args.length == 0 || args[0] == null || args[0].trim().length() == 0) {
            System.err.println("ERROR: corpus name required");
            return;
        }
        String arg0 = args[0].trim();
        if (arg0.equals("-h") || arg0.equals("--help")) {
            System.out.println("Usage: java ListTags <filename>");
            System.out.println(" where <filename> is something like data/pos/ptb-pos-training.txt");
            System.out.println("Usage: java ListTags <directory>");
            System.out.println(" where <directory> is something like data/pos/");
            return;
        }

        String inputPath = args[0];
        File f = new File(inputPath);
        File[] files; // list of files to process
        if (f.isDirectory()) { // directory name was input
            files = f.listFiles(); // process all within the dir
            if (files == null) { // listFiles() returns null when the directory cannot be read
                System.err.println("ERROR: unable to list directory " + inputPath);
                return;
            }
        } else { // name of a regular file was input
            files = new File[] { f };
        }

        try {
            for (File file : files) {
                if (file.isDirectory()) {
                    continue; // skip subdirectories
                }
                if (file.getName().endsWith(".lnk")) {
                    continue; // skip shortcuts
                }
                BufferedReader br = getBufferedReader(file.getAbsolutePath());
                HashMap<String, String> tagList;
                try {
                    tagList = createTagList(br);
                } finally {
                    br.close(); // release the file handle even if reading fails
                }
                writeTagList(file, tagList);
            }
        } catch (IOException e) {
            // the first failure stops processing of any remaining files
            System.err.println("Failed: " + e.getMessage());
        }
    }
}