<?php
/**
 * File containing the ezcDocumentPcssParser class
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 * @package Document
 * @version //autogen//
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License, Version 2.0
 * @access private
 */

/**
 * Parser for simplified CSS rules for PDF layout specifications
 *
 * The EBNF for the parsed grammar is the following. The EBNF does not specify
 * the allowed comments, which are common C-style comments:
 *
 * <code>
 *  File        ::= Directive+
 *  Directive   ::= ( Address | Definition ) '{' Formatting* '}'
 *  Formatting  ::= Name ':' Value ';'
 *  Name        ::= [A-Za-z-]+
 *  Value       ::= QuotedValue | RawValue
 *  QuotedValue ::= '"' [^"]+ '"'
 *  RawValue    ::= [^;]+
 *
 *  Definition  ::= '@' [A-Za-z_-]+
 *
 *  Address     ::= Element ( Rule )*
 *  Rule        ::= '>'? Element
 *  Element     ::= ElementName ( '.' ClassName | '#' ElementId )?
 *
 *  ClassName   ::= [A-Za-z_-]+
 *  ElementName ::= XMLName¹ | '*'
 *  ElementId   ::= XMLName¹
 *
 *  ¹ XMLName refers to http://www.w3.org/TR/REC-xml/#NT-Name
 * </code>
 *
 * @package Document
 * @access private
 * @version //autogen//
 */
class ezcDocumentPcssParser extends ezcDocumentParser
{
    /**
     * Currently parsed file, stored for additional error context
     *
     * Only set while parseFile() is running; null otherwise.
     *
     * @var string
     */
    protected $file;

    /**
     * Expressions for tokenizing the strings.
     *
     * Each entry is an array with the keys 'type' (token constant) and
     * 'match' (PCRE anchored at the start of the remaining input), and the
     * optional keys 'state' (tokenizer state the rule is limited to) and 'to'
     * (tokenizer state to switch to after the rule matched). The rules are
     * tried in order, the first matching rule wins.
     *
     * @var array
     */
    protected $expressions = array();

    /**
     * Tokens irrelevant to the parser, which will be thrown away immediately
     *
     * @var array
     */
    protected $ignoreTokens = array(
        self::T_WHITESPACE,
        self::T_COMMENT,
    );

    /**
     * Names for the known tokens, for nicer error messages
     *
     * @var array
     */
    protected $tokenNames = array(
        self::T_WHITESPACE    => 'T_WHITESPACE',
        self::T_COMMENT       => 'T_COMMENT',
        self::T_ADDRESS       => 'T_ADDRESS (CSS element addressing queries)',
        self::T_DESC_ADDRESS  => 'T_DESC_ADDRESS (CSS element addressing queries)',
        self::T_ADDRESS_ID    => 'T_ADDRESS_ID (CSS element addressing queries)',
        self::T_ADDRESS_CLASS => 'T_ADDRESS_CLASS (CSS element addressing queries)',
        self::T_DEFINITION    => 'T_DEFINITION (CSS definition addressing element)',
        self::T_START         => 'T_START ("{")',
        self::T_END           => 'T_END ("}")',
        self::T_FORMATTING    => 'T_FORMATTING (formatting specification)',
        self::T_VALUE         => 'T_VALUE (formatting value definition)',
        self::T_EOF           => 'T_EOF (end of file)',
    );

    /**
     * Regular expression for characters a XML name may start with, as defined
     * at:
     *
     * http://www.w3.org/TR/REC-xml/#NT-NameStartChar
     *
     * Currently limited to the ASCII subset of the specification.
     */
    const XML_NAME_STARTCHAR = '(?:[:A-Za-z_])';
        // @todo: Integrate: |[#xC0-#xD6]|[#xD8-#xF6]|[#xF8-#x2FF]|[#x370-#x37D]|[#x37F-#x1FFF]|[#x200C-#x200D]|[#x2070-#x218F]|[#x2C00-#x2FEF]|[#x3001-#xD7FF]|[#xF900-#xFDCF]|[#xFDF0-#xFFFD]|[#x10000-#xEFFFF])';

    /**
     * Regular expression for characters a XML name may contain, as defined at:
     *
     * http://www.w3.org/TR/REC-xml/#NT-NameChar
     *
     * We exclude the dot (.) from the name, since this one is used to specify
     * classes, just like in CSS. Should not, but may limit the actual usage.
     * Since now no docbook markup element contains a dot.
     */
    const XML_NAME_CHAR      = '(?:[-0-9])';
        // @todo: Integrate: |#xB7|[#x0300-#x036F]|[#x203F-#x2040])';

    /**
     * Whitespace token
     */
    const T_WHITESPACE    = 1;

    /**
     * Comment token
     */
    const T_COMMENT       = 2;

    /**
     * Common addressing element token
     */
    const T_ADDRESS       = 10;

    /**
     * Direct descendant addressing element token
     */
    const T_DESC_ADDRESS  = 11;

    /**
     * Addressing ID token
     */
    const T_ADDRESS_ID    = 12;

    /**
     * Addressing class token
     */
    const T_ADDRESS_CLASS = 13;

    /**
     * Definition "address" token
     */
    const T_DEFINITION    = 14;

    /**
     * Directive start token
     */
    const T_START         = 20;

    /**
     * Directive end token
     */
    const T_END           = 21;

    /**
     * Formatting rule token
     */
    const T_FORMATTING    = 30;

    /**
     * Formatting rule value token
     */
    const T_VALUE         = 31;

    /**
     * End of file token
     */
    const T_EOF           = 40;

    /**
     * Construct parser
     *
     * Creates the regular expressions for tokenizing the PCSS file.
     *
     * @return void
     */
    public function __construct()
    {
        parent::__construct();

        // A XML name is one start character followed by any number of start
        // or name characters.
        $xmlName = '(?:' . self::XML_NAME_STARTCHAR . '(?:' . self::XML_NAME_STARTCHAR . '|' . self::XML_NAME_CHAR . ')*)';

        // The order of the rules matters: the tokenizer uses the first rule
        // which matches at the beginning of the remaining input.
        $this->expressions = array(
            array(
                'type'  => self::T_WHITESPACE,
                'match' => '(\\A\\s+)S' ),
            array(
                'type'  => self::T_COMMENT,
                'match' => '(\\A/\\*.*\\*/)SUs' ),
            array(
                'type'  => self::T_COMMENT,
                'match' => '(\\A//.*$)Sm' ),
            array(
                'type'  => self::T_START,
                'match' => '(\\A\\{)S' ),
            array(
                'type'  => self::T_END,
                'match' => '(\\A\\})S' ),
            array(
                // A formatting name switches the tokenizer into the
                // "formats" state, in which the value rules are enabled.
                'type'  => self::T_FORMATTING,
                'match' => '(\\A(?P<name>[A-Za-z-]+)\\s*:)S',
                'to'    => 'formats' ),
            array(
                // Quoted value
                'state' => 'formats',
                'type'  => self::T_VALUE,
                'match' => '(\\A"(?P<value>[^"]+)"\\s*;)S',
                'to'    => 'default' ),
            array(
                // Raw value, everything up to the next semicolon
                'state' => 'formats',
                'type'  => self::T_VALUE,
                'match' => '(\\A(?P<value>[^;]+?)\\s*;)S',
                'to'    => 'default' ),
            array(
                'type'  => self::T_ADDRESS,
                'match' => '(\\A' . $xmlName . ')S' ),
            array(
                'type'  => self::T_DESC_ADDRESS,
                'match' => '(\\A>[\\t\\x20]+' . $xmlName . ')S' ),
            array(
                'type'  => self::T_ADDRESS_CLASS,
                'match' => '(\\A\\.[A-Za-z_-]+)S' ),
            array(
                'type'  => self::T_ADDRESS_ID,
                'match' => '(\\A#' . $xmlName . ')S' ),
            array(
                'type'  => self::T_DEFINITION,
                'match' => '(\\A@[A-Za-z_-]+)S' ),
        );
    }

    /**
     * Parse the given file
     *
     * Try to parse the given PCSS file and return the AST containing the file
     * contents. The file name is remembered during parsing, so that it can be
     * passed along with reported errors.
     *
     * @param string $file
     * @return array
     */
    public function parseFile( $file )
    {
        $this->file = $file;
        $ast = $this->parseString( file_get_contents( $file ) );
        $this->file = null;
        return $ast;
    }

    /**
     * Parse the given string
     *
     * Try to parse the given PCSS string and return the AST containing the
     * string contents.
     *
     * @param string $string
     * @return array
     */
    public function parseString( $string )
    {
        // Normalize line endings
        $string = preg_replace( '(\r\n|\r|\n)', "\n", $string );

        return $this->parse(
            $this->tokenize( $string )
        );
    }

    /**
     * Tokenize the input string
     *
     * Returns an array of arrays representing the tokens. Each token array
     * contains the keys 'type', 'line', 'position' and 'match', where 'match'
     * is the preg_match() result of the rule which produced the token. Line
     * and position are recorded after the token has been consumed, so they
     * point behind the token. The last token always is a T_EOF token with a
     * null match.
     *
     * If no rule matches the remaining input, a parse error is triggered.
     *
     * @param string $string
     * @return array
     */
    protected function tokenize( $string )
    {
        $line     = 1;
        $position = 1;
        $tokens   = array();
        $state    = 'default';

        while ( strlen( $string ) )
        {
            foreach ( $this->expressions as $rule )
            {
                // Skip rules bound to another tokenizer state, and rules
                // which do not match at the start of the remaining input
                if ( ( isset( $rule['state'] ) &&
                       ( $rule['state'] !== $state ) ) ||
                     !preg_match( $rule['match'], $string, $match ) )
                {
                    continue;
                }

                // Remove matched string from input
                $string = substr( $string, strlen( $match[0] ) );

                // Update tokenizer state
                if ( isset( $rule['to'] ) )
                {
                    $state = $rule['to'];
                }

                // Update position in file
                $line += substr_count( $match[0], "\n" );
                if ( ( $pos = strrpos( $match[0], "\n" ) ) !== false )
                {
                    // The match contains a line break, so the column starts
                    // over: it is the number of bytes following the last
                    // newline in the match, plus one for the 1-based column.
                    $position = strlen( $match[0] ) - $pos;
                }
                else
                {
                    $position += strlen( $match[0] );
                }

                // Skip irrelevant rules
                if ( in_array( $rule['type'], $this->ignoreTokens, true ) )
                {
                    continue 2;
                }

                // Add all other rules including their match to the token
                // array
                $tokens[] = array(
                    'type'     => $rule['type'],
                    'line'     => $line,
                    'position' => $position,
                    'match'    => $match,
                );

                continue 2;
            }

            // No matching rule could be found
            return $this->triggerError( E_PARSE,
                "Could not parse string: '" . substr( $string, 0, 20 ) . "' in state: $state.",
                $this->file, $line, $position
            );
        }

        $tokens[] = array(
            'type'     => self::T_EOF,
            'line'     => $line,
            'position' => $position,
            'match'    => null,
        );

        return $tokens;
    }

    /**
     * Read expected token from token array
     *
     * Try to read one of the given token types from the token array. If
     * another token is found, a parse error is issued. The first token is
     * removed from the token array and returned in either case.
     *
     * @param array $types
     * @param array $tokens
     * @return array
     */
    private function read( array $types, array &$tokens )
    {
        $token = array_shift( $tokens );

        if ( !in_array( $token['type'], $types, true ) )
        {
            // Translate the expected token types into readable names
            $names = array();
            foreach ( $types as $type )
            {
                $names[] = $this->tokenNames[$type];
            }

            $this->triggerError( E_PARSE,
                "Expected one of: " . implode( ', ', $names ) . ", found " . $this->tokenNames[$token['type']] . '.',
                $this->file, $token['line'], $token['position']
            );
        }

        return $token;
    }

    /**
     * Parse given token array
     *
     * Parse the given token array, and create an array of directive objects
     * from it, if the token array specifies a valid PCSS file.
     *
     * A directive starting with a T_DEFINITION token results in an
     * ezcDocumentPcssDeclarationDirective, which receives the matched
     * definition string as address. Every other directive results in an
     * ezcDocumentPcssLayoutDirective, which receives an array of the matched
     * address strings.
     *
     * @param array $tokens
     * @return array
     */
    protected function parse( array $tokens )
    {
        $directives = array();

        $addressTokens = array(
            self::T_ADDRESS,
            self::T_DESC_ADDRESS,
            self::T_ADDRESS_ID,
            self::T_ADDRESS_CLASS,
        );

        // The token array always ends with the T_EOF token, so stop once
        // only this one is left
        while ( count( $tokens ) > 1 )
        {
            // Address should always be followed by a start token
            $formats = array();
            $address = array();

            if ( $tokens[0]['type'] === self::T_DEFINITION )
            {
                $addressType  = 'ezcDocumentPcssDeclarationDirective';
                $addressToken = $this->read( array( self::T_DEFINITION ), $tokens );
                $address      = $addressToken['match'][0];
            }
            else
            {
                // Collect all address tokens up to the start token
                do {
                    $addressType  = 'ezcDocumentPcssLayoutDirective';
                    $addressToken = $this->read( $addressTokens, $tokens );
                    $address[]    = $addressToken['match'][0];
                }
                while ( $tokens[0]['type'] !== self::T_START );
            }

            $this->read( array( self::T_START ), $tokens );

            // Read name / value pairs until the directive is closed. A
            // repeated name overwrites the earlier value.
            while ( $tokens[0]['type'] !== self::T_END )
            {
                $format = $this->read( array( self::T_FORMATTING ), $tokens );
                $value  = $this->read( array( self::T_VALUE ), $tokens );
                $formats[$format['match']['name']] = $value['match']['value'];
            }

            $this->read( array( self::T_END ), $tokens );

            // Create successfully read directive, located at the last read
            // address token
            $directives[] = new $addressType(
                $address,
                $formats,
                $this->file, $addressToken['line'], $addressToken['position']
            );
        }

        return $directives;
    }
}
?>