# Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Application configuration file in HOCON format (Human-Optimized Config Object Notation). # HOCON syntax is defined at http://github.com/typesafehub/config/blob/master/HOCON.md # and also used by Akka (http://www.akka.io) and Play (http://www.playframework.org/). 
# For more examples see http://doc.akka.io/docs/akka/2.1.2/general/configuration.html

# morphline.conf example file

# this is a comment
// this is yet another comment

# NOTE: ${SOLR_LOCATOR} is a HOCON substitution that is not defined in the
# visible part of this file; it must be resolvable when this config is loaded.

morphlines : [
  {
    id : morphline1
    importCommands : ["org.kitesdk.**", "org.apache.solr.**"]

    # Commands are listed in pipeline order.
    commands : [
      { separateAttachments {} }

      # java command that doesn't do anything except for test compilation
      {
        java {
          imports : "import java.util.*;"
          code: """ List tags = record.get("javaWithImports"); return child.process(record); """
        }
      }

      # java command that doesn't do anything except for test compilation
      {
        java {
          code: """ List tags = record.get("javaWithoutImports"); return child.process(record); """
        }
      }

      {
        # auto-detect MIME type if it isn't explicitly supplied
        detectMimeType {
          includeDefaultMimeTypes : true
        }
      }

      {
        tryRules {
          throwExceptionIfAllRulesFailed : true
          rules : [
            # next top-level rule:
            {
              commands : [
                { logDebug { format : "hello unpack" } }
                { unpack {} }
                { callParentPipe {} }
              ]
            }

            {
              commands : [
                { logDebug { format : "hello decompress" } }
                { decompress {} }
                { callParentPipe {} }
              ]
            }

            # next top-level rule:
            {
              commands : [
                { logDebug { format : "hello solrcell" } }
                {
                  # wrap SolrCell around a JPG Tika parser
                  solrCell {
                    solrLocator : ${SOLR_LOCATOR}
                    captureAttr : true # default is false
                    capture : [content, a, h1, h2] # extract some fields
                    fmap : { exif_image_height : text, a : anchor, h1 : heading1 } # rename some fields
                    dateFormats : [ "yyyy-MM-dd'T'HH:mm:ss", "yyyy-MM-dd"] # various java.text.SimpleDateFormat
                    xpath : "/xhtml:html/xhtml:body/xhtml:div/descendant:node()"
                    uprefix : "ignored_"
                    lowernames : true
                    solrContentHandlerFactory : org.apache.solr.morphlines.cell.TrimSolrContentHandlerFactory
                    parsers : [ # nested Tika parsers
                      { parser : org.apache.tika.parser.jpeg.JpegParser }
                    ]
                  }
                }

                { logDebug { format : "solrcell output: {}", args : ["@{}"] } }
              ]
            }
          ]
        }
      }

      # The generated id is stored in ignored_base_id, which the next command
      # reads as its baseIdField.
      { generateUUID { field : ignored_base_id } }

      {
        generateSolrSequenceKey {
          baseIdField: ignored_base_id
          solrLocator : ${SOLR_LOCATOR}
        }
      }

      {
        loadSolr {
          solrLocator : ${SOLR_LOCATOR}
        }
      }

      {
        logDebug {
          format : "My output record: {}"
          args : ["@{}"]
        }
      }
    ]
  }
]