Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
TikaEncodingDetector |
|
| 2.0;2 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.any23.encoding; | |
19 | ||
20 | import org.apache.tika.parser.txt.CharsetDetector; | |
21 | import org.apache.tika.parser.txt.CharsetMatch; | |
22 | ||
23 | import java.io.BufferedInputStream; | |
24 | import java.io.IOException; | |
25 | import java.io.InputStream; | |
26 | ||
27 | /** | |
28 | * An implementation of {@link EncodingDetector} based on | |
29 | * <a href="http://tika.apache.org/">Apache Tika</a>. | |
30 | * | |
31 | * @author Michele Mostarda ( michele.mostarda@gmail.com ) | |
32 | * @author Davide Palmisano ( dpalmisano@gmail.com ) | |
33 | * @version $Id$ | |
34 | */ | |
35 | 0 | public class TikaEncodingDetector implements EncodingDetector { |
36 | ||
37 | public String guessEncoding(InputStream is) throws IOException { | |
38 | 0 | CharsetDetector charsetDetector = new CharsetDetector(); |
39 | 0 | charsetDetector.setText( is instanceof BufferedInputStream ? is : new BufferedInputStream(is) ); |
40 | 0 | charsetDetector.enableInputFilter(true); |
41 | 0 | CharsetMatch cm = charsetDetector.detect(); |
42 | 0 | return cm.getName(); |
43 | } | |
44 | ||
45 | } |