Add support for orc format by MehulBatra · Pull Request #790 · apache/iceberg-python
Hi @Fokko and @HonahX
✅ I have modified the read logic to read ORC file-based Iceberg tables and wrote an integration test too; it is working great.
Would love some guidance on:
I could find a way to create an ORC file-based Iceberg table via the Glue client (by passing the properties with format=orc).
But this is still producing Parquet data files when I append data. (Is it due to the datafile and deletefile logic, which takes the Parquet file format by default?)
from pyiceberg.catalog import load_catalog
from decimal import Decimal
import pyarrow as pa
# Minimal reproduction: create an Iceberg table with an ORC format property
# through the default catalog, then append a small Arrow table to it.
catalog = load_catalog("default")  # my default catalog is glue
namespace = 'demo_ns'
table_name = 'test_table_dummy_orc_demo'

# Two sample rows covering a decimal, an integer and a string column.
pylist = [
    {'decimal_col': Decimal('32768.1'), 'int_col': 1, 'string_col': "demo_one"},
    {'decimal_col': Decimal('44456.1'), 'int_col': 2, 'string_col': "demo_two"},
]
arrow_schema = pa.schema(
    [
        pa.field('decimal_col', pa.decimal128(33, 1)),
        pa.field('int_col', pa.int32()),
        pa.field('string_col', pa.string()),
    ],
)
arrow_table = pa.Table.from_pylist(pylist, schema=arrow_schema)

new_table = catalog.create_table(
    identifier=f'{namespace}.{table_name}',
    schema=arrow_schema,
    properties={
        # NOTE(review): Iceberg's documented table property for the default
        # data-file format is 'write.format.default'; confirm whether the
        # plain 'format' key is honoured by the writer at all.
        'format': 'orc',
    },
)  # the original snippet never closed this call (syntax error)

# Append through the handle returned by create_table; the original snippet
# referenced an undefined name `table` here.
new_table.append(arrow_table)