public class AvroUtils extends Object
| Modifier and Type | Class and Description |
|---|---|
static class |
AvroUtils.AvroPathFilter |
static class |
AvroUtils.SchemaEntry |
| Modifier and Type | Field and Description |
|---|---|
static String |
FIELD_LOCATION_DELIMITER |
| Constructor and Description |
|---|
AvroUtils() |
| Modifier and Type | Method and Description |
|---|---|
static org.apache.avro.Schema |
addSchemaCreationTime(org.apache.avro.Schema inputSchema,
org.apache.avro.Schema outputSchema) |
static boolean |
checkReaderWriterCompatibility(org.apache.avro.Schema readerSchema,
org.apache.avro.Schema writerSchema,
boolean ignoreNamespace)
Validates that the provided reader schema can be used to decode avro data written with the
provided writer schema.
|
static void |
convertFieldToSchemaWithProps(Map<String,org.codehaus.jackson.JsonNode> fieldProps,
org.apache.avro.Schema targetSchemaObj)
Generate a
Schema object from Schema.Field with Field's properties carried over to the new object. |
static org.apache.avro.generic.GenericRecord |
convertRecordSchema(org.apache.avro.generic.GenericRecord record,
org.apache.avro.Schema newSchema)
Change the schema of an Avro record.
|
static org.apache.avro.generic.GenericRecord |
decorateRecord(org.apache.avro.generic.GenericRecord inputRecord,
Map<String,Object> fieldMap,
org.apache.avro.Schema outputSchema)
Decorate a
GenericRecord with additional fields and make it conform to an extended Schema.
It is the caller's responsibility to ensure that the outputSchema is the merge of the inputRecord's schema
and the additional fields. |
static org.apache.avro.Schema |
decorateRecordSchema(org.apache.avro.Schema inputSchema,
List<org.apache.avro.Schema.Field> fieldList)
Decorate the
Schema for a record with additional Schema.Fields. |
static List<org.apache.avro.Schema.Field> |
deepCopySchemaFields(org.apache.avro.Schema readerSchema) |
static org.apache.commons.math3.util.Pair<org.apache.avro.Schema,List<AvroUtils.SchemaEntry>> |
dropRecursiveFields(org.apache.avro.Schema schema)
Drop recursive fields from a Schema.
|
static org.apache.avro.Schema |
getDirectorySchema(org.apache.hadoop.fs.Path directory,
org.apache.hadoop.conf.Configuration conf,
boolean latest)
Get the latest (or, when latest is false, the oldest) avro schema for a directory
|
static org.apache.avro.Schema |
getDirectorySchema(org.apache.hadoop.fs.Path directory,
org.apache.hadoop.fs.FileSystem fs,
boolean latest)
Get the latest (or, when latest is false, the oldest) avro schema for a directory
|
static com.google.common.base.Optional<org.apache.avro.Schema.Field> |
getField(org.apache.avro.Schema schema,
String fieldLocation)
Given a Schema, this method will return the field specified by the path parameter.
|
static com.google.common.base.Optional<org.apache.avro.Schema> |
getFieldSchema(org.apache.avro.Schema schema,
String fieldLocation)
Given a Schema, this method will return the schema of the field specified by the path parameter.
|
static com.google.common.base.Optional<Object> |
getFieldValue(org.apache.avro.generic.GenericRecord record,
String fieldLocation)
Given a GenericRecord, this method will return the field specified by the path parameter.
|
static Map<String,Object> |
getMultiFieldValue(org.apache.avro.generic.GenericRecord record,
String fieldLocation) |
static String |
getSchemaCreationTime(org.apache.avro.Schema inputSchema) |
static org.apache.avro.Schema |
getSchemaFromDataFile(org.apache.hadoop.fs.Path dataFile,
org.apache.hadoop.fs.FileSystem fs)
Get Avro schema from an Avro data file.
|
static boolean |
isSchemaRecursive(org.apache.avro.Schema schema,
com.google.common.base.Optional<org.slf4j.Logger> logger)
Check if a schema has recursive fields inside it
|
static org.apache.avro.Schema |
nullifyFieldsForSchemaMerge(org.apache.avro.Schema oldSchema,
org.apache.avro.Schema newSchema)
Merge oldSchema and newSchema.
|
static org.apache.avro.generic.GenericRecord |
overrideNameAndNamespace(org.apache.avro.generic.GenericRecord input,
String nameOverride,
com.google.common.base.Optional<Map<String,String>> namespaceOverride)
Given a generic record, override the name and namespace of the schema and return a new generic record
|
static org.apache.avro.Schema |
overrideNameAndNamespace(org.apache.avro.Schema input,
String nameOverride,
com.google.common.base.Optional<Map<String,String>> namespaceOverride)
Given an input schema, override the name and namespace of the schema and return a new schema
|
static org.apache.avro.Schema |
parseSchemaFromFile(org.apache.hadoop.fs.Path filePath,
org.apache.hadoop.fs.FileSystem fs)
Parse Avro schema from a schema file.
|
static byte[] |
recordToByteArray(org.apache.avro.generic.GenericRecord record)
Convert a GenericRecord to a byte array.
|
static com.google.common.base.Optional<org.apache.avro.Schema> |
removeUncomparableFields(org.apache.avro.Schema schema)
Remove map, array, enum fields, as well as union fields that contain map, array or enum,
from an Avro schema.
|
static String |
sanitizeSchemaString(String schemaString)
Escapes the "\", """, ";" and "'" characters in the schema string when it is being used in DDL.
|
static org.apache.hadoop.fs.Path |
serializeAsPath(org.apache.avro.generic.GenericRecord record,
boolean includeFieldNames,
boolean replacePathSeparators)
Serialize a generic record as a relative
Path. |
static org.apache.avro.Schema |
setSchemaCreationTime(org.apache.avro.Schema inputSchema,
String creationTime) |
static org.apache.avro.generic.GenericRecord |
slowDeserializeGenericRecord(byte[] serializedRecord,
org.apache.avro.Schema schema)
Deserialize a
GenericRecord from a byte array. |
static org.apache.avro.Schema |
switchName(org.apache.avro.Schema schema,
String newName)
Copies the input
Schema but changes the schema name. |
static org.apache.avro.Schema |
switchNamespace(org.apache.avro.Schema schema,
Map<String,String> namespaceOverride)
Copies the input
Schema but changes the schema namespace. |
static Map<String,String> |
toStringMap(Object map)
Given a map: key -> value, return a map: key.toString() -> value.toString().
|
static void |
writeSchemaToFile(org.apache.avro.Schema schema,
org.apache.hadoop.fs.Path filePath,
org.apache.hadoop.fs.FileSystem fs,
boolean overwrite) |
static void |
writeSchemaToFile(org.apache.avro.Schema schema,
org.apache.hadoop.fs.Path filePath,
org.apache.hadoop.fs.FileSystem fs,
boolean overwrite,
org.apache.hadoop.fs.permission.FsPermission perm) |
static void |
writeSchemaToFile(org.apache.avro.Schema schema,
org.apache.hadoop.fs.Path filePath,
org.apache.hadoop.fs.Path tempFilePath,
org.apache.hadoop.fs.FileSystem fs,
boolean overwrite) |
static void |
writeSchemaToFile(org.apache.avro.Schema schema,
org.apache.hadoop.fs.Path filePath,
org.apache.hadoop.fs.Path tempFilePath,
org.apache.hadoop.fs.FileSystem fs,
boolean overwrite,
org.apache.hadoop.fs.permission.FsPermission perm)
Write a schema to a file
|
public static final String FIELD_LOCATION_DELIMITER
public static boolean checkReaderWriterCompatibility(org.apache.avro.Schema readerSchema,
org.apache.avro.Schema writerSchema,
boolean ignoreNamespace)
readerSchema - schema to check.
writerSchema - schema to check.
ignoreNamespace - whether name and namespace should be ignored in validation
public static org.apache.avro.Schema addSchemaCreationTime(org.apache.avro.Schema inputSchema,
org.apache.avro.Schema outputSchema)
public static String getSchemaCreationTime(org.apache.avro.Schema inputSchema)
public static org.apache.avro.Schema setSchemaCreationTime(org.apache.avro.Schema inputSchema,
String creationTime)
public static List<org.apache.avro.Schema.Field> deepCopySchemaFields(org.apache.avro.Schema readerSchema)
public static void convertFieldToSchemaWithProps(Map<String,org.codehaus.jackson.JsonNode> fieldProps, org.apache.avro.Schema targetSchemaObj)
Schema object from Schema.Field with Field's properties carried over to the new object.
A common use case for this method is traversing a Schema object into its nested levels and creating a Schema
object for a non-root level.
public static com.google.common.base.Optional<org.apache.avro.Schema> getFieldSchema(org.apache.avro.Schema schema,
String fieldLocation)
schema - is the record to retrieve the schema fromfieldLocation - is the location of the fieldpublic static com.google.common.base.Optional<org.apache.avro.Schema.Field> getField(org.apache.avro.Schema schema,
String fieldLocation)
schema - is the record to retrieve the schema fromfieldLocation - is the location of the fieldpublic static com.google.common.base.Optional<Object> getFieldValue(org.apache.avro.generic.GenericRecord record, String fieldLocation)
record - is the record to retrieve the field fromfieldLocation - is the location of the fieldpublic static Map<String,Object> getMultiFieldValue(org.apache.avro.generic.GenericRecord record, String fieldLocation)
public static Map<String,String> toStringMap(Object map)
Utf8. This method helps to restore the original string map objectmap - a map objectpublic static org.apache.avro.generic.GenericRecord convertRecordSchema(org.apache.avro.generic.GenericRecord record,
org.apache.avro.Schema newSchema)
throws IOException
record - The Avro record whose schema is to be changed.newSchema - The target schema. It must be compatible as reader schema with record.getSchema() as writer schema.IOException - if conversion failed.public static byte[] recordToByteArray(org.apache.avro.generic.GenericRecord record)
throws IOException
IOExceptionpublic static org.apache.avro.Schema getSchemaFromDataFile(org.apache.hadoop.fs.Path dataFile,
org.apache.hadoop.fs.FileSystem fs)
throws IOException
IOExceptionpublic static org.apache.avro.Schema parseSchemaFromFile(org.apache.hadoop.fs.Path filePath,
org.apache.hadoop.fs.FileSystem fs)
throws IOException
IOExceptionpublic static void writeSchemaToFile(org.apache.avro.Schema schema,
org.apache.hadoop.fs.Path filePath,
org.apache.hadoop.fs.FileSystem fs,
boolean overwrite)
throws IOException
IOExceptionpublic static void writeSchemaToFile(org.apache.avro.Schema schema,
org.apache.hadoop.fs.Path filePath,
org.apache.hadoop.fs.Path tempFilePath,
org.apache.hadoop.fs.FileSystem fs,
boolean overwrite)
throws IOException
IOExceptionpublic static void writeSchemaToFile(org.apache.avro.Schema schema,
org.apache.hadoop.fs.Path filePath,
org.apache.hadoop.fs.FileSystem fs,
boolean overwrite,
org.apache.hadoop.fs.permission.FsPermission perm)
throws IOException
IOExceptionpublic static void writeSchemaToFile(org.apache.avro.Schema schema,
org.apache.hadoop.fs.Path filePath,
org.apache.hadoop.fs.Path tempFilePath,
org.apache.hadoop.fs.FileSystem fs,
boolean overwrite,
org.apache.hadoop.fs.permission.FsPermission perm)
throws IOException
schema - the schemafilePath - the target filetempFilePath - if not null then this path is used for a temporary file used to stage the writefs - a FileSystemoverwrite - should any existing target file be overwritten?perm - permissionsIOExceptionpublic static org.apache.avro.Schema getDirectorySchema(org.apache.hadoop.fs.Path directory,
org.apache.hadoop.fs.FileSystem fs,
boolean latest)
throws IOException
directory - the input dir that contains avro filesfs - the FileSystem for the given directory.latest - true to return latest schema, false to return oldest schemaIOExceptionpublic static org.apache.avro.Schema getDirectorySchema(org.apache.hadoop.fs.Path directory,
org.apache.hadoop.conf.Configuration conf,
boolean latest)
throws IOException
directory - the input dir that contains avro filesconf - configurationlatest - true to return latest schema, false to return oldest schemaIOExceptionpublic static org.apache.avro.Schema nullifyFieldsForSchemaMerge(org.apache.avro.Schema oldSchema,
org.apache.avro.Schema newSchema)
oldSchema - newSchema - public static com.google.common.base.Optional<org.apache.avro.Schema> removeUncomparableFields(org.apache.avro.Schema schema)
public static org.apache.avro.Schema switchName(org.apache.avro.Schema schema,
String newName)
Schema but changes the schema name.schema - Schema to copy.newName - name for the copied Schema.Schema that is a copy of schema, but has the name newName.public static org.apache.avro.Schema switchNamespace(org.apache.avro.Schema schema,
Map<String,String> namespaceOverride)
Schema but changes the schema namespace.schema - Schema to copy.namespaceOverride - namespace for the copied Schema.Schema that is a copy of schema, but has the new namespace.public static org.apache.hadoop.fs.Path serializeAsPath(org.apache.avro.generic.GenericRecord record,
boolean includeFieldNames,
boolean replacePathSeparators)
Path. Useful for converting GenericRecord type keys
into file system locations. For example {field1=v1, field2=v2} returns field1=v1/field2=v2 if includeFieldNames
is true, or v1/v2 if it is false. Illegal HDFS tokens such as ':' and '\\' will be replaced with '_'.
Additionally, parameter replacePathSeparators controls whether to replace path separators ('/') with '_'.record - GenericRecord to serialize.includeFieldNames - If true, each token in the path will be of the form key=value, otherwise, only the value
will be included.replacePathSeparators - If true, path separators ('/') in each token will be replaced with '_'.public static String sanitizeSchemaString(String schemaString)
public static org.apache.avro.generic.GenericRecord slowDeserializeGenericRecord(byte[] serializedRecord,
org.apache.avro.Schema schema)
throws IOException
GenericRecord from a byte array. This method is not intended for high performance.IOExceptionpublic static org.apache.avro.Schema decorateRecordSchema(org.apache.avro.Schema inputSchema,
@Nonnull
List<org.apache.avro.Schema.Field> fieldList)
Schema for a record with additional Schema.Fields.inputSchema: - must be a GenericData.Record schema.public static org.apache.avro.generic.GenericRecord decorateRecord(org.apache.avro.generic.GenericRecord inputRecord,
@Nonnull
Map<String,Object> fieldMap,
org.apache.avro.Schema outputSchema)
GenericRecord with additional fields and make it conform to an extended Schema
It is the caller's responsibility to ensure that the outputSchema is the merge of the inputRecord's schema
and the additional fields. The method does not check this for performance reasons, because it is expected to be called in the
critical path of processing a record.
Use decorateRecordSchema(Schema, List) to generate such a Schema before calling this method.inputRecord: - record with data to be copied into the output recordfieldMap: - values can be primitive types or GenericRecords if nestedoutputSchema: - the schema that the decoratedRecord will conform topublic static org.apache.avro.generic.GenericRecord overrideNameAndNamespace(org.apache.avro.generic.GenericRecord input,
String nameOverride,
com.google.common.base.Optional<Map<String,String>> namespaceOverride)
input - input record whose name and namespace need to be overridden
nameOverride - new name for the record schema
namespaceOverride - Optional map containing namespace overrides
public static org.apache.avro.Schema overrideNameAndNamespace(org.apache.avro.Schema input,
String nameOverride,
com.google.common.base.Optional<Map<String,String>> namespaceOverride)
input - nameOverride - namespaceOverride - public static boolean isSchemaRecursive(org.apache.avro.Schema schema,
com.google.common.base.Optional<org.slf4j.Logger> logger)
schema - logger - : Optional logger if you want the method to log why it thinks the schema was recursivepublic static org.apache.commons.math3.util.Pair<org.apache.avro.Schema,List<AvroUtils.SchemaEntry>> dropRecursiveFields(org.apache.avro.Schema schema)
schema -