# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Application configuration file in HOCON format (Human-Optimized Config Object Notation).
# HOCON syntax is defined at http://github.com/typesafehub/config/blob/master/HOCON.md
# and also used by Akka (http://www.akka.io) and Play (http://www.playframework.org/).
# For more examples see http://doc.akka.io/docs/akka/2.1.2/general/configuration.html

# morphline.conf example file
# this is a comment
// this is yet another comment

# Formatting note: in HOCON a "#" or "//" comment extends to the end of the
# physical line, and unquoted array elements are separated by newlines or
# commas. Keep every comment and every array element on its own line.

morphlines : [
  {
    id : morphline1
    importCommands : ["org.kitesdk.**", "org.apache.solr.**"]

    # Commands of morphline1, in the order they are listed.
    commands : [
      { separateAttachments {} }

      # java command that doesn't do anything except for test compilation
      {
        java {
          imports : "import java.util.*;"
          code: """ List tags = record.get("javaWithImports"); return child.process(record); """
        }
      }

      # java command that doesn't do anything except for test compilation
      {
        java {
          code: """ List tags = record.get("javaWithoutImports"); return child.process(record); """
        }
      }

      {
        # used for auto-detection if MIME type isn't explicitly supplied
        detectMimeType {
          includeDefaultMimeTypes : true
          mimeTypesFiles : ["RESOURCES_DIR/custom-mimetypes.xml"]
        }
      }

      {
        tryRules {
          throwExceptionIfAllRulesFailed : true
          rules : [
            # next top-level rule:
            {
              commands : [
                { logDebug { format : "hello unpack" } }
                { unpack {} }
                { generateUUID {} }
                { callParentPipe {} }
              ]
            }

            # rule built around the decompress command
            {
              commands : [
                { logDebug { format : "hello decompress" } }
                { decompress {} }
                { callParentPipe {} }
              ]
            }

            # rule for CSV input (supportedMimeTypes is text/csv)
            {
              commands : [
                {
                  readCSV {
                    supportedMimeTypes : [text/csv]
                    charset : UTF-8
                    ignoreFirstLine : false
                    columns : [ user_screen_name, text ]
                  }
                }

                {
                  generateUUID {
                    field : id
                    preserveExisting : false
                  }
                }

                { sanitizeUnknownSolrFields { solrLocator : ${SOLR_LOCATOR} } }
              ]
            }

            # rule for Avro container input (supportedMimeTypes is avro/binary)
            {
              commands : [
                {
                  readAvroContainer {
                    supportedMimeTypes : [avro/binary]
                    # readerSchemaString : "" # optional, avro json schema blurb for getSchema()
                    # readerSchemaFile : /path/to/syslog.avsc
                  }
                }

                { extractAvroTree {} }

                # copy the /id, /user_screen_name and /text paths into same-named fields
                {
                  setValues {
                    id : "@{/id}"
                    user_screen_name : "@{/user_screen_name}"
                    text : "@{/text}"
                  }
                }

                { sanitizeUnknownSolrFields { solrLocator : ${SOLR_LOCATOR} } }
              ]
            }

            # rule for the test tweet format (mytwittertest/json+delimited+length)
            {
              commands : [
                {
                  readJsonTestTweets {
                    supportedMimeTypes : ["mytwittertest/json+delimited+length"]
                  }
                }

                { sanitizeUnknownSolrFields { solrLocator : ${SOLR_LOCATOR} } }
              ]
            }

            # next top-level rule:
            {
              commands : [
                { logDebug { format : "hello solrcell" } }

                {
                  # wrap SolrCell around an HTML Tika parser
                  solrCell {
                    solrLocator : ${SOLR_LOCATOR}
                    # captureAttr : true # default is false
                    capture : [
                      # twitter feed schema
                      user_friends_count
                      user_location
                      user_description
                      user_statuses_count
                      user_followers_count
                      user_name
                      user_screen_name
                      created_at
                      text
                      retweet_count
                      retweeted
                      in_reply_to_user_id
                      source
                      in_reply_to_status_id
                      media_url_https
                      expanded_url

                      # file metadata
                      file_download_url
                      file_upload_url
                      file_scheme
                      file_host
                      file_port
                      file_path
                      file_name
                      file_length
                      file_last_modified
                      file_owner
                      file_group
                      file_permissions_user
                      file_permissions_group
                      file_permissions_other
                      file_permissions_stickybit
                    ]

                    fmap : { content : text, content-type : content_type } # rename "content" field to "text" fields
                    dateFormats : [ "yyyy-MM-dd'T'HH:mm:ss", "yyyy-MM-dd"] # various java.text.SimpleDateFormat
                    # xpath : "/xhtml:html/xhtml:body/xhtml:div/descendant:node()"
                    uprefix : "ignored_"
                    lowernames : true
                    # solrContentHandlerFactory : org.apache.solr.tika.TrimSolrContentHandlerFactory

                    # Tika parsers to be registered. If multiple parsers support the same MIME type,
                    # the parser is chosen that is closest to the bottom in this list:
                    parsers : [
                      { parser : org.apache.tika.parser.asm.ClassParser }
                      # { parser : org.apache.tika.parser.AutoDetectParser }
                      # { parser : org.gagravarr.tika.OggParser, additionalSupportedMimeTypes : [audio/ogg] }
                      { parser : org.gagravarr.tika.FlacParser }
                      { parser : org.apache.tika.parser.audio.AudioParser }
                      { parser : org.apache.tika.parser.audio.MidiParser }
                      { parser : org.apache.tika.parser.crypto.Pkcs7Parser }
                      { parser : org.apache.tika.parser.dwg.DWGParser }
                      { parser : org.apache.tika.parser.epub.EpubParser }
                      { parser : org.apache.tika.parser.executable.ExecutableParser }
                      { parser : org.apache.tika.parser.feed.FeedParser }
                      { parser : org.apache.tika.parser.font.AdobeFontMetricParser }
                      { parser : org.apache.tika.parser.font.TrueTypeParser }
                      { parser : org.apache.tika.parser.xml.XMLParser }
                      { parser : org.apache.tika.parser.html.HtmlParser }
                      { parser : org.apache.tika.parser.image.ImageParser }
                      { parser : org.apache.tika.parser.image.PSDParser }
                      { parser : org.apache.tika.parser.image.TiffParser }
                      { parser : org.apache.tika.parser.iptc.IptcAnpaParser }
                      { parser : org.apache.tika.parser.iwork.IWorkPackageParser }
                      { parser : org.apache.tika.parser.jpeg.JpegParser }
                      { parser : org.apache.tika.parser.mail.RFC822Parser }
                      { parser : org.apache.tika.parser.mbox.MboxParser, additionalSupportedMimeTypes : [message/x-emlx] }
                      { parser : org.apache.tika.parser.microsoft.OfficeParser }
                      { parser : org.apache.tika.parser.microsoft.TNEFParser }
                      { parser : org.apache.tika.parser.microsoft.ooxml.OOXMLParser }
                      { parser : org.apache.tika.parser.mp3.Mp3Parser }
                      { parser : org.apache.tika.parser.mp4.MP4Parser }
                      { parser : org.apache.tika.parser.hdf.HDFParser }
                      { parser : org.apache.tika.parser.netcdf.NetCDFParser }
                      { parser : org.apache.tika.parser.odf.OpenDocumentParser }
                      { parser : org.apache.tika.parser.pdf.PDFParser }
                      { parser : org.apache.tika.parser.pkg.CompressorParser }
                      { parser : org.apache.tika.parser.pkg.PackageParser }
                      { parser : org.apache.tika.parser.rtf.RTFParser }
                      { parser : org.apache.tika.parser.txt.TXTParser }
                      { parser : org.apache.tika.parser.video.FLVParser }
                      { parser : org.apache.tika.parser.xml.DcXMLParser }
                      { parser : org.apache.tika.parser.xml.FictionBookParser }
                      { parser : org.apache.tika.parser.chm.ChmParser }
                      #{ parser : org.apache.tika.parser.AutoDetectParser }
                    ]
                  }
                }

                { generateUUID { field : ignored_base_id } }

                # baseIdField names the field written by the generateUUID command just above
                {
                  generateSolrSequenceKey {
                    baseIdField: ignored_base_id
                    solrLocator : ${SOLR_LOCATOR}
                  }
                }
              ]
            }
          ]
        }
      }

      { loadSolr { solrLocator : ${SOLR_LOCATOR} } }

      {
        logDebug {
          format : "My output record: {}"
          args : ["@{}"]
        }
      }
    ]
  }
]