/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System.Collections.Generic;
using System.Text;
using System;

namespace Avro
{
    /// <summary>
    /// Collection of static methods for generating the canonical form of schemas.
    /// </summary>
    public static class SchemaNormalization
    {
        // Seed of the 64-bit fingerprint and the constant folded into the lookup table.
        // Kept private and constant so that fingerprints cannot be altered by
        // reassigning the public Empty64 field.
        private const long EmptyFingerprint64 = -4513414715797952619;

        /// <summary>
        /// Value returned by the 64-bit fingerprint for an empty byte string.
        /// </summary>
        /// <remarks>
        /// This remains a mutable public field for backward compatibility only.
        /// The fingerprint routines in this class use a private constant copy,
        /// so assigning a new value here does not change computed fingerprints.
        /// </remarks>
        public static long Empty64 = EmptyFingerprint64;

        /// <summary>
        /// Parses a schema into the canonical form as defined by Avro spec.
        /// </summary>
        /// <param name="s">Schema</param>
        /// <returns>Parsing Canonical Form of a schema as defined by Avro spec.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
        public static string ToParsingForm(Schema s)
        {
            if (s == null)
            {
                throw new ArgumentNullException("s");
            }

            // Maps the full name of each named schema already written to its quoted form,
            // so that repeated occurrences are emitted as a name reference only.
            IDictionary<string, string> env = new Dictionary<string, string>();
            return Build(env, s, new StringBuilder()).ToString();
        }

        /// <summary>
        /// <para>Returns a fingerprint of a string of bytes. This string is
        /// presumed to contain a canonical form of a schema. The
        /// algorithm used to compute the fingerprint is selected by the
        /// argument <paramref name="fpName"/>.</para>
        ///
        /// <para>If <paramref name="fpName"/> equals the string
        /// <c>"CRC-64-AVRO"</c>, then the result of <see cref="Fingerprint64(byte[])"/> is
        /// returned in little-endian format.</para>
        ///
        /// <para>If <paramref name="fpName"/> equals the string
        /// <c>"MD5"</c>, then the standard MD5 algorithm is used.</para>
        ///
        /// <para>If <paramref name="fpName"/> equals the string
        /// <c>"SHA-256"</c>, then the standard SHA-256 algorithm is used.</para>
        ///
        /// <para>Otherwise, <paramref name="fpName"/> is
        /// not recognized and an
        /// <see cref="ArgumentException"/> is thrown.</para>
        ///
        /// <para>Recommended Avro practice dictates that
        /// <c>"CRC-64-AVRO"</c> is used for 64-bit fingerprints,
        /// <c>"MD5"</c> is used for 128-bit fingerprints, and
        /// <c>"SHA-256"</c> is used for 256-bit fingerprints.</para>
        /// </summary>
        /// <param name="fpName">Name of the hashing algorithm.</param>
        /// <param name="data">Data to be hashed.</param>
        /// <returns>Fingerprint</returns>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="fpName"/> is not a supported algorithm name.</exception>
        public static byte[] Fingerprint(string fpName, byte[] data)
        {
            if (data == null)
            {
                throw new ArgumentNullException("data");
            }

            switch (fpName)
            {
                case "CRC-64-AVRO":
                    long fp = Fingerprint64(data);
                    byte[] result = new byte[8];
                    for (int i = 0; i < 8; i++)
                    {
                        // Emit the least significant byte first (little-endian). The
                        // truncation is intentional, so it must not trip overflow
                        // checking when the assembly is compiled with /checked+.
                        result[i] = unchecked((byte)fp);
                        fp >>= 8;
                    }
                    return result;
                case "MD5":
                    // Hash algorithm instances are disposable; release them deterministically.
                    using (var md5 = System.Security.Cryptography.MD5.Create())
                    {
                        return md5.ComputeHash(data);
                    }
                case "SHA-256":
                    using (var sha256 = System.Security.Cryptography.SHA256.Create())
                    {
                        return sha256.ComputeHash(data);
                    }
                default:
                    throw new ArgumentException(string.Format("Unsupported fingerprint computation algorithm ({0})", fpName));
            }
        }

        /// <summary>
        /// Returns <see cref="Fingerprint(string, byte[])"/> applied to the parsing canonical form of the supplied schema.
        /// </summary>
        /// <param name="fpName">Name of the hashing algorithm.</param>
        /// <param name="s">Schema to be hashed.</param>
        /// <returns>Fingerprint</returns>
        public static byte[] ParsingFingerprint(string fpName, Schema s)
        {
            return Fingerprint(fpName, Encoding.UTF8.GetBytes(ToParsingForm(s)));
        }

        /// <summary>
        /// Returns <see cref="Fingerprint64(byte[])"/> applied to the parsing canonical form of the supplied schema.
        /// </summary>
        /// <param name="s">Schema to be hashed.</param>
        /// <returns>Fingerprint</returns>
        public static long ParsingFingerprint64(Schema s)
        {
            return Fingerprint64(Encoding.UTF8.GetBytes(ToParsingForm(s)));
        }

        /// <summary>
        /// Computes the 64-bit Rabin Fingerprint (as recommended in the Avro spec) of a byte string.
        /// </summary>
        /// <param name="data">Data to be hashed.</param>
        /// <returns>Fingerprint</returns>
        private static long Fingerprint64(byte[] data)
        {
            long result = EmptyFingerprint64;

            // The algorithm reinterprets signed values as unsigned to get a logical
            // right shift; that must never be treated as an arithmetic overflow.
            unchecked
            {
                foreach (byte b in data)
                {
                    // Mask to the low 8 bits before narrowing so the index is always 0..255.
                    int index = (int)((result ^ b) & 0xff);
                    result = (long)((ulong)result >> 8) ^ Fp64.FpTable[index];
                }
            }

            return result;
        }

        /// <summary>
        /// Appends the canonical form of <paramref name="s"/> to <paramref name="o"/>,
        /// recursing into nested schemas.
        /// </summary>
        /// <param name="env">
        /// Full names of the named schemas written so far, mapped to their quoted form.
        /// A named schema found here is written as that quoted name only.
        /// </param>
        /// <param name="s">Schema to write.</param>
        /// <param name="o">Builder receiving the output.</param>
        /// <returns>The same builder instance, <paramref name="o"/>.</returns>
        private static StringBuilder Build(IDictionary<string, string> env, Schema s, StringBuilder o)
        {
            // Tracks whether a separating comma is needed; each case that uses it
            // returns before another case could observe the changed value.
            bool firstTime = true;
            Schema.Type st = s.Tag;
            switch (st)
            {
                case Schema.Type.Union:
                    UnionSchema us = (UnionSchema)s;
                    o.Append('[');
                    foreach (Schema b in us.Schemas)
                    {
                        if (!firstTime)
                        {
                            o.Append(",");
                        }
                        else
                        {
                            firstTime = false;
                        }
                        Build(env, b, o);
                    }
                    return o.Append(']');

                case Schema.Type.Array:
                case Schema.Type.Map:
                    o.Append("{\"type\":\"").Append(Schema.GetTypeString(s.Tag)).Append("\"");
                    if (st == Schema.Type.Array)
                    {
                        ArraySchema arraySchema = (ArraySchema)s;
                        Build(env, arraySchema.ItemSchema, o.Append(",\"items\":"));
                    }
                    else
                    {
                        MapSchema mapSchema = (MapSchema)s;
                        Build(env, mapSchema.ValueSchema, o.Append(",\"values\":"));
                    }
                    return o.Append("}");

                case Schema.Type.Enumeration:
                case Schema.Type.Fixed:
                case Schema.Type.Record:
                    NamedSchema namedSchema = (NamedSchema)s;
                    var name = namedSchema.Fullname;

                    // A named schema that was already written is referenced by name only.
                    string knownName;
                    if (env.TryGetValue(name, out knownName))
                    {
                        return o.Append(knownName);
                    }

                    // Register the name before descending so that nested references
                    // to this same schema resolve to the name.
                    var qname = "\"" + name + "\"";
                    env.Add(name, qname);
                    o.Append("{\"name\":").Append(qname);
                    o.Append(",\"type\":\"").Append(Schema.GetTypeString(s.Tag)).Append("\"");
                    if (st == Schema.Type.Enumeration)
                    {
                        EnumSchema enumSchema = (EnumSchema)s;
                        o.Append(",\"symbols\":[");
                        foreach (var enumSymbol in enumSchema.Symbols)
                        {
                            if (!firstTime)
                            {
                                o.Append(",");
                            }
                            else
                            {
                                firstTime = false;
                            }
                            o.Append("\"").Append(enumSymbol).Append("\"");
                        }
                        o.Append("]");
                    }
                    else if (st == Schema.Type.Fixed)
                    {
                        FixedSchema fixedSchema = (FixedSchema)s;
                        o.Append(",\"size\":").Append(fixedSchema.Size.ToString());
                    }
                    else // st == Schema.Type.Record
                    {
                        RecordSchema recordSchema = (RecordSchema)s;
                        o.Append(",\"fields\":[");
                        foreach (var field in recordSchema.Fields)
                        {
                            if (!firstTime)
                            {
                                o.Append(",");
                            }
                            else
                            {
                                firstTime = false;
                            }
                            o.Append("{\"name\":\"").Append(field.Name).Append("\"");
                            Build(env, field.Schema, o.Append(",\"type\":")).Append("}");
                        }
                        o.Append("]");
                    }
                    return o.Append("}");

                default:    //boolean, bytes, double, float, int, long, null, string
                    return o.Append("\"").Append(s.Name).Append("\"");
            }
        }

        /// <summary>
        /// Holds the 256-entry lookup table used by <see cref="Fingerprint64(byte[])"/>.
        /// The table is filled by the static constructor.
        /// </summary>
        private static class Fp64
        {
            private static readonly long[] fpTable = new long[256];

            /// <summary>
            /// Lookup table indexed by a value in the range 0..255.
            /// </summary>
            public static long[] FpTable
            {
                get { return fpTable; }
            }

            static Fp64()
            {
                // Signed/unsigned reinterpretation below is intentional bit manipulation.
                unchecked
                {
                    for (int i = 0; i < 256; i++)
                    {
                        long fp = i;
                        for (int j = 0; j < 8; j++)
                        {
                            // mask is all ones when the low bit is set, otherwise zero.
                            long mask = -(fp & 1L);
                            fp = (long)((ulong)fp >> 1) ^ (EmptyFingerprint64 & mask);
                        }
                        fpTable[i] = fp;
                    }
                }
            }
        }
    }
}