From ba9eb991f27f0a996a0058e1630d260350c9c5bf Mon Sep 17 00:00:00 2001 From: Regan Koopmans Date: Tue, 17 Feb 2026 16:12:37 +0100 Subject: [PATCH 1/2] feat: support conversion of BigQuery schema to Protobuf Add functionality to convert BigQuery table schemas to protocol buffer descriptors. This enables schema conversion for BigQuery Storage Write API operations. - Add table_schema_to_proto_descriptor function - Support basic types, structs, and range fields - Implement field name sanitization and collision avoidance - Add comprehensive test coverage - Update documentation and changelog --- .../CHANGELOG.md | 7 + .../docs/bigquery_storage_v1/library.rst | 4 + .../cloud/bigquery_storage_v1/__init__.py | 4 +- .../cloud/bigquery_storage_v1/schema.py | 292 +++++++++ .../tests/unit/test_schema.py | 597 ++++++++++++++++++ 5 files changed, 903 insertions(+), 1 deletion(-) create mode 100644 packages/google-cloud-bigquery-storage/google/cloud/bigquery_storage_v1/schema.py create mode 100644 packages/google-cloud-bigquery-storage/tests/unit/test_schema.py diff --git a/packages/google-cloud-bigquery-storage/CHANGELOG.md b/packages/google-cloud-bigquery-storage/CHANGELOG.md index 3c6d98ccede7..355c8d18c821 100644 --- a/packages/google-cloud-bigquery-storage/CHANGELOG.md +++ b/packages/google-cloud-bigquery-storage/CHANGELOG.md @@ -4,6 +4,13 @@ [1]: https://pypi.org/project/google-cloud-bigquery-storage/#history +## [2.37.0](https://github.com/googleapis/google-cloud-python/compare/google-cloud-bigquery-storage-v2.36.1...google-cloud-bigquery-storage-v2.37.0) (2026-02-17) + + +### Features + +* support conversion of BigQuery schema to Protobuf ([2711330](https://github.com/googleapis/google-cloud-python/commit/2711330f0a096a2a9d1b02e51081d1af25a37501)) + ## [2.36.1](https://github.com/googleapis/google-cloud-python/compare/google-cloud-bigquery-storage-v2.36.0...google-cloud-bigquery-storage-v2.36.1) (2026-02-12) diff --git a/packages/google-cloud-bigquery-storage/docs/bigquery_storage_v1/library.rst b/packages/google-cloud-bigquery-storage/docs/bigquery_storage_v1/library.rst index acdd71b15526..cbc38fd93956 100644 --- a/packages/google-cloud-bigquery-storage/docs/bigquery_storage_v1/library.rst +++ b/packages/google-cloud-bigquery-storage/docs/bigquery_storage_v1/library.rst @@ -8,3 +8,7 @@ Bigquery Storage v1 API Library .. automodule:: google.cloud.bigquery_storage_v1.reader :members: :inherited-members: + +.. 
automodule:: google.cloud.bigquery_storage_v1.schema + :members: + :inherited-members: diff --git a/packages/google-cloud-bigquery-storage/google/cloud/bigquery_storage_v1/__init__.py b/packages/google-cloud-bigquery-storage/google/cloud/bigquery_storage_v1/__init__.py index 050d120d5d3e..7de93f019287 100644 --- a/packages/google-cloud-bigquery-storage/google/cloud/bigquery_storage_v1/__init__.py +++ b/packages/google-cloud-bigquery-storage/google/cloud/bigquery_storage_v1/__init__.py @@ -28,7 +28,7 @@ # this code path once we drop support for Python 3.7 import importlib_metadata as metadata -from google.cloud.bigquery_storage_v1 import client, types +from google.cloud.bigquery_storage_v1 import client, schema, types class BigQueryReadClient(client.BigQueryReadClient): @@ -140,4 +140,6 @@ def _get_version(dependency_name): # google.cloud.bigquery_storage_v1.client "BigQueryReadClient", "BigQueryWriteClient", + # google.cloud.bigquery_storage_v1.schema + "schema", ) diff --git a/packages/google-cloud-bigquery-storage/google/cloud/bigquery_storage_v1/schema.py b/packages/google-cloud-bigquery-storage/google/cloud/bigquery_storage_v1/schema.py new file mode 100644 index 000000000000..5a41d1a228d8 --- /dev/null +++ b/packages/google-cloud-bigquery-storage/google/cloud/bigquery_storage_v1/schema.py @@ -0,0 +1,292 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for converting BigQuery schemas to Protocol Buffer descriptors. + +This module provides functionality to dynamically generate Protocol Buffer +descriptors from BigQuery table schemas, eliminating the need to manually +create and compile .proto files when using the BigQuery Storage Write API. 
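+
+The public entry point is table_schema_to_proto_descriptor, which accepts a
+TableSchema and returns a DescriptorProto ready for use with a ProtoSchema;
+see its docstring for a worked example.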
+""" + +import re +from typing import Dict, List, Tuple + +from google.cloud.bigquery_storage_v1 import types +from google.protobuf import descriptor_pb2 + + +# Mapping from BigQuery types to Protocol Buffer field types +_BQ_TO_PROTO_TYPE_MAP: Dict[types.TableFieldSchema.Type, int] = { + types.TableFieldSchema.Type.STRING: descriptor_pb2.FieldDescriptorProto.TYPE_STRING, + types.TableFieldSchema.Type.INT64: descriptor_pb2.FieldDescriptorProto.TYPE_INT64, + types.TableFieldSchema.Type.BOOL: descriptor_pb2.FieldDescriptorProto.TYPE_BOOL, + types.TableFieldSchema.Type.BYTES: descriptor_pb2.FieldDescriptorProto.TYPE_BYTES, + types.TableFieldSchema.Type.DOUBLE: descriptor_pb2.FieldDescriptorProto.TYPE_DOUBLE, + # DATE is represented as days since epoch + types.TableFieldSchema.Type.DATE: descriptor_pb2.FieldDescriptorProto.TYPE_INT32, + # DATETIME is represented as a formatted string + types.TableFieldSchema.Type.DATETIME: descriptor_pb2.FieldDescriptorProto.TYPE_STRING, + # TIME is represented as a formatted string + types.TableFieldSchema.Type.TIME: descriptor_pb2.FieldDescriptorProto.TYPE_STRING, + # TIMESTAMP is represented as microseconds since epoch + types.TableFieldSchema.Type.TIMESTAMP: descriptor_pb2.FieldDescriptorProto.TYPE_INT64, + # NUMERIC and BIGNUMERIC are represented as strings + types.TableFieldSchema.Type.NUMERIC: descriptor_pb2.FieldDescriptorProto.TYPE_STRING, + types.TableFieldSchema.Type.BIGNUMERIC: descriptor_pb2.FieldDescriptorProto.TYPE_STRING, + # GEOGRAPHY is represented as WKT string + types.TableFieldSchema.Type.GEOGRAPHY: descriptor_pb2.FieldDescriptorProto.TYPE_STRING, + # JSON is represented as a string + types.TableFieldSchema.Type.JSON: descriptor_pb2.FieldDescriptorProto.TYPE_STRING, + # INTERVAL is represented as a string + types.TableFieldSchema.Type.INTERVAL: descriptor_pb2.FieldDescriptorProto.TYPE_STRING, +} + + +def _sanitize_field_name(field_name: str) -> str: + """Sanitize a field name to make it proto-compatible. + + Args: + field_name: The original field name. + + Returns: + The sanitized field name. + """ + # Replace invalid characters with underscores. + sanitized = re.sub(r'[^a-zA-Z0-9_]', '_', field_name) + # If the first character is a digit, prepend an underscore. + if sanitized and sanitized[0].isdigit(): + sanitized = '_' + sanitized + # As a convention, field names are lowercased. + return sanitized.lower() + + +def _get_field_label(mode: types.TableFieldSchema.Mode) -> int: + """Convert BigQuery field mode to Protocol Buffer field label. + + Args: + mode: The BigQuery field mode (NULLABLE, REQUIRED, or REPEATED). + + Returns: + The corresponding Protocol Buffer field label constant. + """ + if mode == types.TableFieldSchema.Mode.REQUIRED: + return descriptor_pb2.FieldDescriptorProto.LABEL_REQUIRED + elif mode == types.TableFieldSchema.Mode.REPEATED: + return descriptor_pb2.FieldDescriptorProto.LABEL_REPEATED + else: # NULLABLE or MODE_UNSPECIFIED + return descriptor_pb2.FieldDescriptorProto.LABEL_OPTIONAL + + +def _convert_bq_field_to_proto_field( + bq_field: types.TableFieldSchema, + field_number: int, + scope: str, +) -> descriptor_pb2.FieldDescriptorProto: + """Convert a BigQuery field to a Protocol Buffer field descriptor. + + Args: + bq_field: The BigQuery field schema. + field_number: The field number (position) in the message. + scope: The scope/type name for nested messages (STRUCT/RANGE). + + Returns: + A FieldDescriptorProto for the field. 
+ """ + field_name = _sanitize_field_name(bq_field.name) + mode = bq_field.mode or types.TableFieldSchema.Mode.NULLABLE + + field_descriptor = descriptor_pb2.FieldDescriptorProto() + field_descriptor.name = field_name + field_descriptor.number = field_number + field_descriptor.label = _get_field_label(mode) + + if bq_field.type_ == types.TableFieldSchema.Type.STRUCT: + field_descriptor.type = descriptor_pb2.FieldDescriptorProto.TYPE_MESSAGE + field_descriptor.type_name = scope + elif bq_field.type_ == types.TableFieldSchema.Type.RANGE: + field_descriptor.type = descriptor_pb2.FieldDescriptorProto.TYPE_MESSAGE + field_descriptor.type_name = scope + else: + proto_type = _BQ_TO_PROTO_TYPE_MAP.get(bq_field.type_) + if proto_type is None: + raise ValueError( + f"Unsupported BigQuery type: {bq_field.type_} for field {bq_field.name}" + ) + field_descriptor.type = proto_type + + return field_descriptor + + +def _convert_bq_table_schema_to_proto_descriptor_impl( + table_schema: types.TableSchema, + scope: str, +) -> Tuple[descriptor_pb2.DescriptorProto, List[descriptor_pb2.DescriptorProto]]: + """Recursively convert BigQuery table schema to proto descriptor. + + Args: + table_schema: The BigQuery table schema. + scope: The current scope for naming nested messages. + + Returns: + A tuple of (descriptor, nested_descriptors): + - descriptor: The DescriptorProto for this level + - nested_descriptors: List of all nested DescriptorProto objects + + Raises: + ValueError: If the schema contains unsupported field types or invalid RANGE fields. + """ + fields = [] + all_nested_descriptors = [] + field_number = 1 + + for bq_field in table_schema.fields: + if bq_field.type_ == types.TableFieldSchema.Type.STRUCT: + # Sanitize the field name for use in scope + scope_name = _sanitize_field_name(bq_field.name) + current_scope = f"{scope}__{scope_name}" + + # Recursively convert nested struct + nested_schema = types.TableSchema(fields=list(bq_field.fields)) + nested_descriptor, deeply_nested = _convert_bq_table_schema_to_proto_descriptor_impl( + nested_schema, current_scope + ) + all_nested_descriptors.append(nested_descriptor) + all_nested_descriptors.extend(deeply_nested) + + # Create field pointing to the nested message + field = _convert_bq_field_to_proto_field(bq_field, field_number, current_scope) + fields.append(field) + + elif bq_field.type_ == types.TableFieldSchema.Type.RANGE: + # Sanitize the field name for use in scope + scope_name = _sanitize_field_name(bq_field.name) + current_scope = f"{scope}__{scope_name}" + + # Validate RANGE element type + if not bq_field.range_element_type or not bq_field.range_element_type.type_: + raise ValueError( + f"RANGE field '{bq_field.name}' is missing range_element_type. " + f"RANGE fields must specify an element type (DATE, DATETIME, or TIMESTAMP)." + ) + + element_type = bq_field.range_element_type.type_ + + # Validate the element type is supported + if element_type not in ( + types.TableFieldSchema.Type.DATE, + types.TableFieldSchema.Type.DATETIME, + types.TableFieldSchema.Type.TIMESTAMP, + ): + raise ValueError( + f"Unsupported element type '{element_type}' for RANGE field '{bq_field.name}'. " + f"Supported types are DATE, DATETIME, and TIMESTAMP." 
+ ) + + # Create RANGE nested message with start and end fields + range_fields = [ + types.TableFieldSchema( + name="start", + type_=element_type, + mode=types.TableFieldSchema.Mode.NULLABLE, + ), + types.TableFieldSchema( + name="end", + type_=element_type, + mode=types.TableFieldSchema.Mode.NULLABLE, + ), + ] + range_schema = types.TableSchema(fields=range_fields) + range_descriptor, _ = _convert_bq_table_schema_to_proto_descriptor_impl( + range_schema, current_scope + ) + all_nested_descriptors.append(range_descriptor) + + # Create field pointing to the RANGE message + field = _convert_bq_field_to_proto_field(bq_field, field_number, current_scope) + fields.append(field) + + else: + # Primitive field + field = _convert_bq_field_to_proto_field(bq_field, field_number, "") + fields.append(field) + + field_number += 1 + + # Create the descriptor for this level + descriptor = descriptor_pb2.DescriptorProto() + descriptor.name = scope + descriptor.field.extend(fields) + + return descriptor, all_nested_descriptors + + +def table_schema_to_proto_descriptor( + table_schema: types.TableSchema, + message_name: str = "root", +) -> descriptor_pb2.DescriptorProto: + """Convert a BigQuery TableSchema to a Protocol Buffer DescriptorProto. + + This function generates a Protocol Buffer descriptor that can be used with + the BigQuery Storage Write API without needing to create and compile .proto + files. The generated descriptor uses proto2 wire format, which is required + by the Write API. + + Args: + table_schema: The BigQuery table schema to convert. + message_name: Optional name for the root message type. Defaults to "root". + + Returns: + A DescriptorProto that can be used with ProtoSchema in the Write API. + + Raises: + ValueError: If the schema contains unsupported field types or invalid RANGE fields. + + Example: + >>> from google.cloud.bigquery_storage_v1 import schema, types + >>> + >>> # Define a BigQuery schema + >>> table_schema = types.TableSchema(fields=[ + ... types.TableFieldSchema( + ... name="id", + ... type_=types.TableFieldSchema.Type.INT64, + ... mode=types.TableFieldSchema.Mode.REQUIRED + ... ), + ... types.TableFieldSchema( + ... name="name", + ... type_=types.TableFieldSchema.Type.STRING + ... ), + ... ]) + >>> + >>> # Convert to proto descriptor + >>> proto_descriptor = schema.table_schema_to_proto_descriptor(table_schema) + >>> + >>> # Use with Write API + >>> proto_schema = types.ProtoSchema() + >>> proto_schema.proto_descriptor = proto_descriptor + + Note: + For detailed information about BigQuery to Protocol Buffer type mappings, + see: https://cloud.google.com/bigquery/docs/write-api#data_type_conversions + """ + # Convert using scope-based naming + root_descriptor, nested_descriptors = _convert_bq_table_schema_to_proto_descriptor_impl( + table_schema, message_name + ) + + root_descriptor.nested_type.extend(nested_descriptors) + + return root_descriptor + + +__all__ = ("table_schema_to_proto_descriptor",) diff --git a/packages/google-cloud-bigquery-storage/tests/unit/test_schema.py b/packages/google-cloud-bigquery-storage/tests/unit/test_schema.py new file mode 100644 index 000000000000..3a2560e1692a --- /dev/null +++ b/packages/google-cloud-bigquery-storage/tests/unit/test_schema.py @@ -0,0 +1,597 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from google.cloud.bigquery_storage_v1 import schema, types +from google.protobuf import descriptor_pb2 + + +class TestTableSchemaToProtoDescriptor: + """Tests for table_schema_to_proto_descriptor function.""" + + def test_basic_types(self): + """Test conversion of basic BigQuery types to proto types.""" + table_schema = types.TableSchema( + fields=[ + types.TableFieldSchema( + name="string_col", type_=types.TableFieldSchema.Type.STRING + ), + types.TableFieldSchema( + name="int64_col", type_=types.TableFieldSchema.Type.INT64 + ), + types.TableFieldSchema( + name="bool_col", type_=types.TableFieldSchema.Type.BOOL + ), + types.TableFieldSchema( + name="bytes_col", type_=types.TableFieldSchema.Type.BYTES + ), + types.TableFieldSchema( + name="double_col", type_=types.TableFieldSchema.Type.DOUBLE + ), + ] + ) + + proto_descriptor = schema.table_schema_to_proto_descriptor(table_schema) + + assert proto_descriptor.name == "root" + assert len(proto_descriptor.field) == 5 + + # Check string field (field names are lowercased) + string_field = proto_descriptor.field[0] + assert string_field.name == "string_col" + assert string_field.number == 1 + assert string_field.type == descriptor_pb2.FieldDescriptorProto.TYPE_STRING + assert string_field.label == descriptor_pb2.FieldDescriptorProto.LABEL_OPTIONAL + + # Check int64 field + int64_field = proto_descriptor.field[1] + assert int64_field.name == "int64_col" + assert int64_field.number == 2 + assert int64_field.type == descriptor_pb2.FieldDescriptorProto.TYPE_INT64 + + # Check bool field + bool_field = proto_descriptor.field[2] + assert bool_field.name == "bool_col" + assert bool_field.number == 3 + assert bool_field.type == descriptor_pb2.FieldDescriptorProto.TYPE_BOOL + + # Check bytes field + bytes_field = proto_descriptor.field[3] + assert bytes_field.name == "bytes_col" + assert bytes_field.number == 4 + assert bytes_field.type == descriptor_pb2.FieldDescriptorProto.TYPE_BYTES + + # Check double field + double_field = proto_descriptor.field[4] + assert double_field.name == "double_col" + assert double_field.number == 5 + assert double_field.type == descriptor_pb2.FieldDescriptorProto.TYPE_DOUBLE + + def test_special_types(self): + """Test conversion of special BigQuery types (DATE, TIMESTAMP, NUMERIC, etc.).""" + table_schema = types.TableSchema( + fields=[ + types.TableFieldSchema( + name="date_col", type_=types.TableFieldSchema.Type.DATE + ), + types.TableFieldSchema( + name="datetime_col", type_=types.TableFieldSchema.Type.DATETIME + ), + types.TableFieldSchema( + name="time_col", type_=types.TableFieldSchema.Type.TIME + ), + types.TableFieldSchema( + name="timestamp_col", type_=types.TableFieldSchema.Type.TIMESTAMP + ), + types.TableFieldSchema( + name="numeric_col", type_=types.TableFieldSchema.Type.NUMERIC + ), + types.TableFieldSchema( + name="bignumeric_col", type_=types.TableFieldSchema.Type.BIGNUMERIC + ), + types.TableFieldSchema( + name="geography_col", type_=types.TableFieldSchema.Type.GEOGRAPHY + ), + types.TableFieldSchema( + name="json_col", type_=types.TableFieldSchema.Type.JSON + ), + ] + ) + + 
proto_descriptor = schema.table_schema_to_proto_descriptor(table_schema) + + # DATE -> INT32 + assert proto_descriptor.field[0].type == descriptor_pb2.FieldDescriptorProto.TYPE_INT32 + + # DATETIME -> STRING + assert proto_descriptor.field[1].type == descriptor_pb2.FieldDescriptorProto.TYPE_STRING + + # TIME -> STRING + assert proto_descriptor.field[2].type == descriptor_pb2.FieldDescriptorProto.TYPE_STRING + + # TIMESTAMP -> INT64 + assert proto_descriptor.field[3].type == descriptor_pb2.FieldDescriptorProto.TYPE_INT64 + + # NUMERIC -> STRING + assert proto_descriptor.field[4].type == descriptor_pb2.FieldDescriptorProto.TYPE_STRING + + # BIGNUMERIC -> STRING + assert proto_descriptor.field[5].type == descriptor_pb2.FieldDescriptorProto.TYPE_STRING + + # GEOGRAPHY -> STRING + assert proto_descriptor.field[6].type == descriptor_pb2.FieldDescriptorProto.TYPE_STRING + + # JSON -> STRING + assert proto_descriptor.field[7].type == descriptor_pb2.FieldDescriptorProto.TYPE_STRING + + def test_field_modes(self): + """Test conversion of BigQuery field modes to proto labels.""" + table_schema = types.TableSchema( + fields=[ + types.TableFieldSchema( + name="nullable_col", + type_=types.TableFieldSchema.Type.STRING, + mode=types.TableFieldSchema.Mode.NULLABLE, + ), + types.TableFieldSchema( + name="required_col", + type_=types.TableFieldSchema.Type.STRING, + mode=types.TableFieldSchema.Mode.REQUIRED, + ), + types.TableFieldSchema( + name="repeated_col", + type_=types.TableFieldSchema.Type.STRING, + mode=types.TableFieldSchema.Mode.REPEATED, + ), + ] + ) + + proto_descriptor = schema.table_schema_to_proto_descriptor(table_schema) + + # NULLABLE -> LABEL_OPTIONAL + assert ( + proto_descriptor.field[0].label + == descriptor_pb2.FieldDescriptorProto.LABEL_OPTIONAL + ) + + # REQUIRED -> LABEL_REQUIRED + assert ( + proto_descriptor.field[1].label + == descriptor_pb2.FieldDescriptorProto.LABEL_REQUIRED + ) + + # REPEATED -> LABEL_REPEATED + assert ( + proto_descriptor.field[2].label + == descriptor_pb2.FieldDescriptorProto.LABEL_REPEATED + ) + + def test_struct_field(self): + """Test conversion of STRUCT (nested message) fields with scope-based naming.""" + table_schema = types.TableSchema( + fields=[ + types.TableFieldSchema( + name="struct_col", + type_=types.TableFieldSchema.Type.STRUCT, + fields=[ + types.TableFieldSchema( + name="sub_string", + type_=types.TableFieldSchema.Type.STRING, + ), + types.TableFieldSchema( + name="sub_int", + type_=types.TableFieldSchema.Type.INT64, + ), + ], + ), + ] + ) + + proto_descriptor = schema.table_schema_to_proto_descriptor(table_schema) + + # Check main field (uses scope-based naming) + struct_field = proto_descriptor.field[0] + assert struct_field.name == "struct_col" + assert struct_field.type == descriptor_pb2.FieldDescriptorProto.TYPE_MESSAGE + assert struct_field.type_name == "root__struct_col" + + # Check nested type + assert len(proto_descriptor.nested_type) == 1 + nested_type = proto_descriptor.nested_type[0] + assert nested_type.name == "root__struct_col" + assert len(nested_type.field) == 2 + + # Check nested fields + assert nested_type.field[0].name == "sub_string" + assert nested_type.field[0].type == descriptor_pb2.FieldDescriptorProto.TYPE_STRING + assert nested_type.field[1].name == "sub_int" + assert nested_type.field[1].type == descriptor_pb2.FieldDescriptorProto.TYPE_INT64 + + def test_repeated_struct(self): + """Test conversion of repeated STRUCT fields (arrays of structs).""" + table_schema = types.TableSchema( + fields=[ + 
types.TableFieldSchema( + name="struct_list", + type_=types.TableFieldSchema.Type.STRUCT, + mode=types.TableFieldSchema.Mode.REPEATED, + fields=[ + types.TableFieldSchema( + name="item_id", + type_=types.TableFieldSchema.Type.INT64, + ), + ], + ), + ] + ) + + proto_descriptor = schema.table_schema_to_proto_descriptor(table_schema) + + struct_field = proto_descriptor.field[0] + assert struct_field.label == descriptor_pb2.FieldDescriptorProto.LABEL_REPEATED + assert struct_field.type == descriptor_pb2.FieldDescriptorProto.TYPE_MESSAGE + + def test_range_field(self): + """Test conversion of RANGE fields with scope-based naming.""" + table_schema = types.TableSchema( + fields=[ + types.TableFieldSchema( + name="date_range", + type_=types.TableFieldSchema.Type.RANGE, + range_element_type=types.TableFieldSchema.FieldElementType( + type_=types.TableFieldSchema.Type.DATE + ), + ), + ] + ) + + proto_descriptor = schema.table_schema_to_proto_descriptor(table_schema) + + # Check main field (uses scope-based naming) + range_field = proto_descriptor.field[0] + assert range_field.name == "date_range" + assert range_field.type == descriptor_pb2.FieldDescriptorProto.TYPE_MESSAGE + assert range_field.type_name == "root__date_range" + + # Check nested Range message + assert len(proto_descriptor.nested_type) == 1 + range_type = proto_descriptor.nested_type[0] + assert range_type.name == "root__date_range" + assert len(range_type.field) == 2 + + # Check start field + start_field = range_type.field[0] + assert start_field.name == "start" + assert start_field.number == 1 + assert start_field.type == descriptor_pb2.FieldDescriptorProto.TYPE_INT32 # DATE -> INT32 + + # Check end field + end_field = range_type.field[1] + assert end_field.name == "end" + assert end_field.number == 2 + assert end_field.type == descriptor_pb2.FieldDescriptorProto.TYPE_INT32 + + def test_deeply_nested_struct(self): + """Test conversion of deeply nested STRUCT fields with hierarchical scope naming.""" + table_schema = types.TableSchema( + fields=[ + types.TableFieldSchema( + name="outer", + type_=types.TableFieldSchema.Type.STRUCT, + fields=[ + types.TableFieldSchema( + name="inner", + type_=types.TableFieldSchema.Type.STRUCT, + fields=[ + types.TableFieldSchema( + name="value", + type_=types.TableFieldSchema.Type.STRING, + ), + ], + ), + ], + ), + ] + ) + + proto_descriptor = schema.table_schema_to_proto_descriptor(table_schema) + + # Check outer struct + assert len(proto_descriptor.nested_type) >= 1 + # Find the outer nested type + outer_type = next(nt for nt in proto_descriptor.nested_type if nt.name == "root__outer") + assert outer_type.name == "root__outer" + + # Find the inner nested type (should be in root's nested types due to flattening) + inner_type = next(nt for nt in proto_descriptor.nested_type if nt.name == "root__outer__inner") + assert inner_type.name == "root__outer__inner" + assert len(inner_type.field) == 1 + assert inner_type.field[0].name == "value" + + def test_custom_message_name(self): + """Test specifying a custom message name.""" + table_schema = types.TableSchema( + fields=[ + types.TableFieldSchema( + name="id", type_=types.TableFieldSchema.Type.INT64 + ), + ] + ) + + proto_descriptor = schema.table_schema_to_proto_descriptor( + table_schema, message_name="CustomRow" + ) + + assert proto_descriptor.name == "CustomRow" + + def test_field_numbering(self): + """Test that field numbers are assigned sequentially starting from 1.""" + table_schema = types.TableSchema( + fields=[ + types.TableFieldSchema( + 
name="first", type_=types.TableFieldSchema.Type.STRING + ), + types.TableFieldSchema( + name="second", type_=types.TableFieldSchema.Type.INT64 + ), + types.TableFieldSchema( + name="third", type_=types.TableFieldSchema.Type.BOOL + ), + ] + ) + + proto_descriptor = schema.table_schema_to_proto_descriptor(table_schema) + + assert proto_descriptor.field[0].number == 1 + assert proto_descriptor.field[1].number == 2 + assert proto_descriptor.field[2].number == 3 + + def test_empty_schema(self): + """Test conversion of an empty schema.""" + table_schema = types.TableSchema(fields=[]) + + proto_descriptor = schema.table_schema_to_proto_descriptor(table_schema) + + assert proto_descriptor.name == "root" + assert len(proto_descriptor.field) == 0 + + def test_complex_schema(self): + """Test a complex schema with multiple field types and nesting.""" + table_schema = types.TableSchema( + fields=[ + types.TableFieldSchema( + name="id", + type_=types.TableFieldSchema.Type.INT64, + mode=types.TableFieldSchema.Mode.REQUIRED, + ), + types.TableFieldSchema( + name="name", + type_=types.TableFieldSchema.Type.STRING, + ), + types.TableFieldSchema( + name="tags", + type_=types.TableFieldSchema.Type.STRING, + mode=types.TableFieldSchema.Mode.REPEATED, + ), + types.TableFieldSchema( + name="metadata", + type_=types.TableFieldSchema.Type.STRUCT, + fields=[ + types.TableFieldSchema( + name="created_at", + type_=types.TableFieldSchema.Type.TIMESTAMP, + ), + types.TableFieldSchema( + name="attributes", + type_=types.TableFieldSchema.Type.JSON, + ), + ], + ), + types.TableFieldSchema( + name="active_period", + type_=types.TableFieldSchema.Type.RANGE, + range_element_type=types.TableFieldSchema.FieldElementType( + type_=types.TableFieldSchema.Type.TIMESTAMP + ), + ), + ] + ) + + proto_descriptor = schema.table_schema_to_proto_descriptor(table_schema) + + # Verify overall structure + assert len(proto_descriptor.field) == 5 + assert len(proto_descriptor.nested_type) == 2 # metadata and active_period + + # Verify required field + assert ( + proto_descriptor.field[0].label + == descriptor_pb2.FieldDescriptorProto.LABEL_REQUIRED + ) + + # Verify repeated field + assert ( + proto_descriptor.field[2].label + == descriptor_pb2.FieldDescriptorProto.LABEL_REPEATED + ) + + # Verify struct field + metadata_field = proto_descriptor.field[3] + assert metadata_field.type == descriptor_pb2.FieldDescriptorProto.TYPE_MESSAGE + + # Verify range field + range_field = proto_descriptor.field[4] + assert range_field.type == descriptor_pb2.FieldDescriptorProto.TYPE_MESSAGE + + def test_range_without_element_type_raises_error(self): + """Test that RANGE fields without element type raise ValueError.""" + table_schema = types.TableSchema( + fields=[ + types.TableFieldSchema( + name="incomplete_range", + type_=types.TableFieldSchema.Type.RANGE, + # Missing range_element_type - should raise error + ), + ] + ) + + with pytest.raises(ValueError) as exc_info: + schema.table_schema_to_proto_descriptor(table_schema) + + assert "RANGE field 'incomplete_range' is missing range_element_type" in str( + exc_info.value + ) + + def test_scope_based_naming_avoids_collisions(self): + """Test that scope-based naming naturally avoids collisions.""" + # Even if field names might collide with generated names, scope-based naming prevents issues + table_schema = types.TableSchema( + fields=[ + types.TableFieldSchema( + name="my_record", + type_=types.TableFieldSchema.Type.STRUCT, + fields=[ + types.TableFieldSchema( + name="value", + 
type_=types.TableFieldSchema.Type.STRING, + ), + ], + ), + types.TableFieldSchema( + name="my_record_struct", # Would collide with suffix-based naming + type_=types.TableFieldSchema.Type.STRING, + ), + ] + ) + + proto_descriptor = schema.table_schema_to_proto_descriptor(table_schema) + + # Verify fields are created correctly + assert len(proto_descriptor.field) == 2 + assert proto_descriptor.field[0].name == "my_record" + assert proto_descriptor.field[0].type_name == "root__my_record" + assert proto_descriptor.field[1].name == "my_record_struct" + + # Verify nested type uses scope-based name + assert len(proto_descriptor.nested_type) == 1 + assert proto_descriptor.nested_type[0].name == "root__my_record" + + def test_field_name_sanitization(self): + """Test that field names are sanitized to be proto-compatible.""" + table_schema = types.TableSchema( + fields=[ + types.TableFieldSchema( + name="field-with-hyphens", + type_=types.TableFieldSchema.Type.STRING, + ), + types.TableFieldSchema( + name="field with spaces", + type_=types.TableFieldSchema.Type.STRING, + ), + types.TableFieldSchema( + name="123field", + type_=types.TableFieldSchema.Type.STRING, + ), + types.TableFieldSchema( + name="field@special#chars", + type_=types.TableFieldSchema.Type.STRING, + ), + types.TableFieldSchema( + name="ValidField", + type_=types.TableFieldSchema.Type.STRING, + ), + ] + ) + + proto_descriptor = schema.table_schema_to_proto_descriptor(table_schema) + + # Hyphens replaced with underscores + assert proto_descriptor.field[0].name == "field_with_hyphens" + + # Spaces replaced with underscores + assert proto_descriptor.field[1].name == "field_with_spaces" + + # Field starting with digit gets prepended underscore + assert proto_descriptor.field[2].name == "_123field" + + # Special characters replaced with underscores + assert proto_descriptor.field[3].name == "field_special_chars" + + # Valid field names are lowercased + assert proto_descriptor.field[4].name == "validfield" + + def test_field_name_sanitization_in_nested_structs(self): + """Test that field name sanitization works in nested STRUCT fields.""" + table_schema = types.TableSchema( + fields=[ + types.TableFieldSchema( + name="outer-struct", + type_=types.TableFieldSchema.Type.STRUCT, + fields=[ + types.TableFieldSchema( + name="inner-field", + type_=types.TableFieldSchema.Type.STRING, + ), + types.TableFieldSchema( + name="123inner", + type_=types.TableFieldSchema.Type.INT64, + ), + ], + ), + ] + ) + + proto_descriptor = schema.table_schema_to_proto_descriptor(table_schema) + + # Outer struct field name sanitized + outer_field = proto_descriptor.field[0] + assert outer_field.name == "outer_struct" + assert outer_field.type_name == "root__outer_struct" + + # Nested type name sanitized + nested_type = proto_descriptor.nested_type[0] + assert nested_type.name == "root__outer_struct" + + # Inner fields sanitized + assert nested_type.field[0].name == "inner_field" + assert nested_type.field[1].name == "_123inner" + + def test_field_name_sanitization_in_range_fields(self): + """Test that field name sanitization works for RANGE fields.""" + table_schema = types.TableSchema( + fields=[ + types.TableFieldSchema( + name="date-range", + type_=types.TableFieldSchema.Type.RANGE, + range_element_type=types.TableFieldSchema.FieldElementType( + type_=types.TableFieldSchema.Type.DATE + ), + ), + ] + ) + + proto_descriptor = schema.table_schema_to_proto_descriptor(table_schema) + + # Range field name sanitized + range_field = proto_descriptor.field[0] + assert 
range_field.name == "date_range" + assert range_field.type_name == "root__date_range" + + # Range type name sanitized + range_type = proto_descriptor.nested_type[0] + assert range_type.name == "root__date_range" + + +if __name__ == "__main__": + pytest.main([__file__]) From 8cf39e7c99ee45a89fc88685f387b499592091cf Mon Sep 17 00:00:00 2001 From: Regan Koopmans Date: Wed, 18 Feb 2026 09:30:07 +0100 Subject: [PATCH 2/2] doc: undo changelog changes --- packages/google-cloud-bigquery-storage/CHANGELOG.md | 7 ------- 1 file changed, 7 deletions(-) diff --git a/packages/google-cloud-bigquery-storage/CHANGELOG.md b/packages/google-cloud-bigquery-storage/CHANGELOG.md index 355c8d18c821..3c6d98ccede7 100644 --- a/packages/google-cloud-bigquery-storage/CHANGELOG.md +++ b/packages/google-cloud-bigquery-storage/CHANGELOG.md @@ -4,13 +4,6 @@ [1]: https://pypi.org/project/google-cloud-bigquery-storage/#history -## [2.37.0](https://github.com/googleapis/google-cloud-python/compare/google-cloud-bigquery-storage-v2.36.1...google-cloud-bigquery-storage-v2.37.0) (2026-02-17) - - -### Features - -* support conversion of BigQuery schema to Protobuf ([2711330](https://github.com/googleapis/google-cloud-python/commit/2711330f0a096a2a9d1b02e51081d1af25a37501)) - ## [2.36.1](https://github.com/googleapis/google-cloud-python/compare/google-cloud-bigquery-storage-v2.36.0...google-cloud-bigquery-storage-v2.36.1) (2026-02-12)
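
A minimal usage sketch of how the new helper plugs into a Storage Write API
request. It relies only on types that already exist in
google.cloud.bigquery_storage_v1.types (TableSchema, TableFieldSchema,
ProtoSchema, AppendRowsRequest); the table layout itself is illustrative.

    from google.cloud.bigquery_storage_v1 import schema, types

    # Describe the destination table in BigQuery terms.
    table_schema = types.TableSchema(
        fields=[
            types.TableFieldSchema(
                name="id",
                type_=types.TableFieldSchema.Type.INT64,
                mode=types.TableFieldSchema.Mode.REQUIRED,
            ),
            types.TableFieldSchema(
                name="name",
                type_=types.TableFieldSchema.Type.STRING,
            ),
        ]
    )

    # Generate a proto descriptor instead of compiling a .proto file.
    descriptor = schema.table_schema_to_proto_descriptor(table_schema)

    # Attach it as the writer schema of an AppendRows request.
    proto_schema = types.ProtoSchema()
    proto_schema.proto_descriptor = descriptor
    proto_data = types.AppendRowsRequest.ProtoData()
    proto_data.writer_schema = proto_schema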