/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.chukwa.datacollection.adaptor.filetailer;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.chukwa.ChunkImpl;
import org.apache.hadoop.chukwa.datacollection.ChunkReceiver;
import org.apache.hadoop.chukwa.datacollection.adaptor.AbstractAdaptor;
import org.apache.hadoop.chukwa.datacollection.adaptor.AdaptorException;
import org.apache.hadoop.chukwa.datacollection.adaptor.AdaptorShutdownPolicy;
import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.Logger;

/**
 * A base class for file tailing adaptors.
 * Intended to mandate as little policy as possible, and to use as
 * few system resources as possible.
 *
 * If the file does not exist, this class will continue to retry quietly
 * forever and will start tailing if it's eventually created.
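 *
 * A common way to start the adaptor is through the Chukwa agent's control
 * interface; the data type and path below are hypothetical:
 * <pre>
 *   add filetailer.LWFTAdaptor MyDataType /var/log/app.log 0
 * </pre>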
 */
public class LWFTAdaptor extends AbstractAdaptor {

  /**
   * This is the maximum amount we'll read from any one file before moving on
   * to the next. This way, we get quick response time for other files if one
   * file is growing rapidly.
   *
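   * The limit defaults to 128 KB and is configurable through the
   * chukwaAgent.fileTailingAdaptor.maxReadSize agent option, which is read
   * in parseArgs() below.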
   */
  public static final int DEFAULT_MAX_READ_SIZE = 128 * 1024;
  public static final String MAX_READ_SIZE_OPT =
      "chukwaAgent.fileTailingAdaptor.maxReadSize";

  public static int MAX_READ_SIZE = DEFAULT_MAX_READ_SIZE;

  static Logger log;
  protected static FileTailer tailer;

  static {
    tailer = null;
    log = Logger.getLogger(LWFTAdaptor.class);
  }

  /**
   * next PHYSICAL offset to read
   */
  protected long fileReadOffset;

  /**
   * The logical offset of the first byte of the file
   */
  protected long offsetOfFirstByte = 0;
  protected Configuration conf = null;

  File toWatch;

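  /**
   * Begin tailing. Note that {@code offset} is a logical stream offset; the
   * physical read position is recovered by subtracting offsetOfFirstByte.
   */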
  @Override
  public void start(long offset) {
    synchronized(LWFTAdaptor.class) {
      if(tailer == null)
        tailer = new FileTailer(control.getConfiguration());
    }
    this.fileReadOffset = offset - offsetOfFirstByte;
    tailer.startWatchingFile(this);
  }

  /**
   * @see org.apache.hadoop.chukwa.datacollection.adaptor.Adaptor#getCurrentStatus()
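   *
   * For a hypothetical adaptor of type "MyDataType" watching /var/log/app.log
   * from offset zero, this returns "MyDataType 0 /var/log/app.log".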
   */
  public String getCurrentStatus() {
    return type.trim() + " " + offsetOfFirstByte + " " + toWatch.getPath();
  }

  public String toString() {
    return "Lightweight Tailer on " + toWatch;
  }

  public String getStreamName() {
    return toWatch.getPath();
  }

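  /**
   * Parse the adaptor parameters: an optional first-byte offset followed by
   * the path of the file to tail. For example (with an illustrative path),
   * "3000 /var/log/app.log" starts the logical stream at byte 3000, whereas
   * "/var/log/app.log" alone starts it at zero.
   *
   * @return the absolute path of the file to watch
   */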
  @Override
  public String parseArgs(String params) {
    conf = control.getConfiguration();
    MAX_READ_SIZE = conf.getInt(MAX_READ_SIZE_OPT, DEFAULT_MAX_READ_SIZE);

    Pattern cmd = Pattern.compile("(\\d+)\\s+(.+)\\s?");
    Matcher m = cmd.matcher(params);
    if (m.matches()) { // check for a first-byte offset; if absent, assume we just got a path
      offsetOfFirstByte = Long.parseLong(m.group(1));
      toWatch = new File(m.group(2));
    } else {
      toWatch = new File(params.trim());
    }
    return toWatch.getAbsolutePath();
  }

  @Override
  public long shutdown(AdaptorShutdownPolicy shutdownPolicy)
      throws AdaptorException {
    tailer.stopWatchingFile(this);
    return fileReadOffset + offsetOfFirstByte;
  }

  /**
   * Extract records from a byte sequence.
   *
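   * This base implementation emits the entire buffer as one chunk; subclasses
   * are expected to override it when data should be split on record
   * boundaries, as the character-oriented tailing adaptors in this package do.
   *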
   * @param eq the queue to stick the new chunk(s) in
   * @param buffOffsetInFile the byte offset in the stream at which buf[] begins
   * @param buf the byte buffer to extract records from
   * @return the number of bytes processed
   * @throws InterruptedException if interrupted while enqueueing a chunk
   */
  protected int extractRecords(ChunkReceiver eq, long buffOffsetInFile,
      byte[] buf) throws InterruptedException {
    if(buf.length == 0)
      return 0;

    ChunkImpl chunk = new ChunkImpl(type, toWatch.getAbsolutePath(),
        buffOffsetInFile + buf.length, buf, this);

    eq.add(chunk);
    return buf.length;
  }

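  /**
   * Read from the current physical offset up to {@code len} bytes, clamped to
   * MAX_READ_SIZE, and hand the buffer to extractRecords(). A worked example
   * with made-up numbers: if fileReadOffset is 0, len is 300 * 1024, and
   * MAX_READ_SIZE is the default 128 KB, only the first 128 KB are read and
   * the method returns true so the caller knows to come back for the rest.
   *
   * @param len the current length of the file, as measured by the caller
   * @param reader an open reader on the watched file; this method seeks to
   *          fileReadOffset before reading
   * @return true if known-unread data remains in the file
   * @throws IOException if reading the file fails
   * @throws InterruptedException if interrupted while enqueueing a chunk
   */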
  protected boolean slurp(long len, RandomAccessFile reader)
      throws IOException, InterruptedException {
    boolean hasMoreData = false;

    log.debug("Adaptor|" + adaptorID + "|seeking|" + fileReadOffset);
    reader.seek(fileReadOffset);

    long bufSize = len - fileReadOffset;

    if (bufSize > MAX_READ_SIZE) {
      bufSize = MAX_READ_SIZE;
      hasMoreData = true;
    }
    byte[] buf = new byte[(int) bufSize];

    long curOffset = fileReadOffset;

    int bufferRead = reader.read(buf);
    assert reader.getFilePointer() == fileReadOffset + bufSize :
        "event size arithmetic is broken: pointer is " + reader.getFilePointer()
        + " but offset is " + (fileReadOffset + bufSize);

    int bytesUsed = extractRecords(dest,
        fileReadOffset + offsetOfFirstByte, buf);

    // === WARNING ===
    // If we couldn't find a complete record AND we cannot read any further,
    // i.e. bufferRead == MAX_READ_SIZE, then the record is too BIG.
    // So log a warning and drop the current buffer, so that we keep moving
    // instead of being stuck at this point forever.
    if (bytesUsed == 0 && bufferRead == MAX_READ_SIZE) {
      log.warn("bufferRead == MAX_READ_SIZE AND bytesUsed == 0, dropping current buffer: startOffset="
          + curOffset + ", MAX_READ_SIZE=" + MAX_READ_SIZE + ", for "
          + toWatch.getPath());
      bytesUsed = buf.length;
    }

    fileReadOffset = fileReadOffset + bytesUsed;

    log.debug("Adaptor|" + adaptorID + "|start|" + curOffset + "|end|"
        + fileReadOffset);
    return hasMoreData;
  }

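  /**
   * Called periodically by the FileTailer thread: compares the file's current
   * length with fileReadOffset, then either handles a shrunken file or slurps
   * any newly appended bytes.
   *
   * @return true if there is known unread data in the file
   */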
  public synchronized boolean tailFile()
      throws InterruptedException {
    boolean hasMoreData = false;
    try {
      // if the file doesn't exist, length = 0 and we just keep waiting for it
      // if(!toWatch.exists())
      //   deregisterAndStop(false);

      long len = toWatch.length();
      if(len < fileReadOffset) {
        // file shrank; probably some data went missing
        handleShrunkenFile(len);
      } else if(len > fileReadOffset) {
        RandomAccessFile reader = new RandomAccessFile(toWatch, "r");
        try {
          // propagate slurp()'s result so the caller learns about leftover data
          hasMoreData = slurp(len, reader);
        } finally {
          reader.close();
        }
      }
    } catch(IOException e) {
      log.warn("IOException in tailer", e);
      deregisterAndStop();
    }

    return hasMoreData;
  }

  private void handleShrunkenFile(long measuredLen) {
    log.info("file " + toWatch + " shrank from " + fileReadOffset + " to "
        + measuredLen);
    offsetOfFirstByte = measuredLen;
    fileReadOffset = 0;
  }

}