From 95a77b401f326f58db39ea7792d4da4f0a738c59 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 11 Nov 2025 10:18:32 -0800 Subject: [PATCH 1/4] add options to control page encoding stats reading --- parquet/benches/metadata.rs | 30 +++++++ parquet/src/arrow/arrow_reader/mod.rs | 90 +++++++++++++++++++ parquet/src/basic.rs | 15 ++++ parquet/src/file/metadata/mod.rs | 55 ++++++++++-- parquet/src/file/metadata/options.rs | 89 +++++++++++++++++- .../src/file/metadata/thrift/encryption.rs | 5 +- parquet/src/file/metadata/thrift/mod.rs | 52 +++++++++-- parquet/src/file/serialized_reader.rs | 82 +++++++++++++++++ 8 files changed, 395 insertions(+), 23 deletions(-) diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs index 43b08e6b26a4..409b79e0f050 100644 --- a/parquet/benches/metadata.rs +++ b/parquet/benches/metadata.rs @@ -173,6 +173,22 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(true); + c.bench_function("decode metadata with stats mask", |b| { + b.iter(|| { + ParquetMetaDataReader::decode_metadata_with_options(&meta_data, Some(&options)) + .unwrap(); + }) + }); + + let options = ParquetMetaDataOptions::new().with_skip_encoding_stats(true); + c.bench_function("decode metadata with skip PES", |b| { + b.iter(|| { + ParquetMetaDataReader::decode_metadata_with_options(&meta_data, Some(&options)) + .unwrap(); + }) + }); + let buf: Bytes = black_box(encoded_meta()).into(); c.bench_function("decode parquet metadata (wide)", |b| { b.iter(|| { @@ -187,6 +203,20 @@ fn criterion_benchmark(c: &mut Criterion) { ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); }) }); + + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(true); + c.bench_function("decode metadata (wide) with stats mask", |b| { + b.iter(|| { + ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); + }) + }); + + let options = ParquetMetaDataOptions::new().with_skip_encoding_stats(true); + c.bench_function("decode metadata (wide) with skip PES", |b| { + b.iter(|| { + ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); + }) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 673e9d9d7fa6..5caf3687b2c4 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -537,6 +537,30 @@ impl ArrowReaderOptions { self } + /// Set whether to convert `encoding_stats` to a bitmask. + /// + /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this + /// might be desirable. + /// + /// [`ColumnChunkMetaData::page_encoding_stats_mask`]: + /// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask + pub fn with_encoding_stats_as_mask(mut self, val: bool) -> Self { + self.metadata_options.set_encoding_stats_as_mask(val); + self + } + + /// Set whether to skip decoding `encoding_stats`. + pub fn with_skip_encoding_stats(mut self, val: bool) -> Self { + self.metadata_options.set_skip_encoding_stats(val); + self + } + + /// Provide a list of column indicies for which to decode `encoding_stats`. + pub fn with_keep_encoding_stats(mut self, keep: &[usize]) -> Self { + self.metadata_options.set_keep_encoding_stats(keep); + self + } + /// Provide the file decryption properties to use when reading encrypted parquet files. /// /// If encryption is enabled and the file is encrypted, the `file_decryption_properties` must be provided. @@ -1282,6 +1306,72 @@ mod tests { assert_eq!(expected.as_ref(), builder.metadata.as_ref()); } + #[test] + fn test_page_encoding_stats_mask() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/alltypes_tiny_pages.parquet"); + let file = File::open(path).unwrap(); + + let arrow_options = ArrowReaderOptions::new().with_encoding_stats_as_mask(true); + let builder = + ParquetRecordBatchReaderBuilder::try_new_with_options(file, arrow_options).unwrap(); + + let row_group_metadata = builder.metadata.row_group(0); + + // test page encoding stats + let page_encoding_stats = row_group_metadata + .column(0) + .page_encoding_stats_mask() + .unwrap(); + assert!(page_encoding_stats.is_set(Encoding::PLAIN)); + // PLAIN = 0, so 1 << 0 or 1 + assert_eq!(page_encoding_stats.as_i32() ^ 1, 0); + let page_encoding_stats = row_group_metadata + .column(2) + .page_encoding_stats_mask() + .unwrap(); + assert!(page_encoding_stats.is_set(Encoding::PLAIN_DICTIONARY)); + // PLAIN_DICTIONARY = 2, so 1 << 2 + assert_eq!(page_encoding_stats.as_i32() ^ (1 << 2), 0); + } + + #[test] + fn test_page_encoding_stats_skipped() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/alltypes_tiny_pages.parquet"); + let file = File::open(path).unwrap(); + + // test skipping all + let arrow_options = ArrowReaderOptions::new().with_skip_encoding_stats(true); + let builder = ParquetRecordBatchReaderBuilder::try_new_with_options( + file.try_clone().unwrap(), + arrow_options, + ) + .unwrap(); + + let row_group_metadata = builder.metadata.row_group(0); + for column in row_group_metadata.columns() { + assert!(column.page_encoding_stats().is_none()); + assert!(column.page_encoding_stats_mask().is_none()); + } + + // test skipping all but one column and converting to mask + let arrow_options = ArrowReaderOptions::new() + .with_encoding_stats_as_mask(true) + .with_keep_encoding_stats(&[0]); + let builder = ParquetRecordBatchReaderBuilder::try_new_with_options( + file.try_clone().unwrap(), + arrow_options, + ) + .unwrap(); + + let row_group_metadata = builder.metadata.row_group(0); + for (idx, column) in row_group_metadata.columns().iter().enumerate() { + assert!(column.page_encoding_stats().is_none()); + assert_eq!(column.page_encoding_stats_mask().is_some(), idx == 0); + } + } + #[test] fn test_arrow_reader_single_column() { let file = get_test_file("parquet/generated_simple_numerics/blogs.parquet"); diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index eaa889bb99f1..2e9c5978b43b 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -737,6 +737,11 @@ impl EncodingMask { self.0 & (1 << (val as i32)) != 0 } + /// Test if this mask has only the bit for the given [`Encoding`] set. + pub fn is_only(&self, val: Encoding) -> bool { + self.0 == (1 << (val as i32)) + } + /// Test if all [`Encoding`]s in a given set are present in this mask. pub fn all_set<'a>(&self, mut encodings: impl Iterator) -> bool { encodings.all(|&e| self.is_set(e)) @@ -2498,4 +2503,14 @@ mod tests { "Parquet error: Attempt to create invalid mask: 0x2" ); } + + #[test] + fn test_encoding_mask_is_only() { + let mask = EncodingMask::new_from_encodings([Encoding::PLAIN].iter()); + assert!(mask.is_only(Encoding::PLAIN)); + + let mask = + EncodingMask::new_from_encodings([Encoding::PLAIN, Encoding::PLAIN_DICTIONARY].iter()); + assert!(!mask.is_only(Encoding::PLAIN)); + } } diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 45b69a66799f..d89d4b0491a4 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -813,6 +813,7 @@ pub struct ColumnChunkMetaData { statistics: Option, geo_statistics: Option>, encoding_stats: Option>, + encoding_stats_mask: Option, bloom_filter_offset: Option, bloom_filter_length: Option, offset_index_offset: Option, @@ -1050,12 +1051,43 @@ impl ColumnChunkMetaData { self.geo_statistics.as_deref() } - /// Returns the offset for the page encoding stats, - /// or `None` if no page encoding stats are available. + /// Returns the page encoding statistics, or `None` if no page encoding statistics + /// are available. pub fn page_encoding_stats(&self) -> Option<&Vec> { self.encoding_stats.as_ref() } + /// Returns the page encoding statistics reduced to a bitmask, or `None` if statistics are + /// not available. + /// + /// The [`PageEncodingStats`] struct was added to the Parquet specification specifically to + /// enable fast determination of whether all pages in a column chunk are dictionary encoded + /// (see ). + /// Decoding the full page encoding statistics, however, can be very costly, and is not + /// necessary to support the aforementioned use case. As an alternative, this crate can + /// instead distill the list of `PageEncodingStats` down to a bitmask of just the encodings + /// used for data pages + /// (see [`ParquetMetaDataOptions::set_encoding_stats_as_mask`]). + /// To test for an all-dictionary-encoded chunk one could use this bitmask in the following way: + /// + /// ```rust + /// use parquet::basic::Encoding; + /// use parquet::file::metadata::ColumnChunkMetaData; + /// // test if all data pages in the column chunk are dictionary encoded + /// fn is_all_dictionary_encoded(col_meta: &ColumnChunkMetaData) -> bool { + /// // check that dictionary encoding was used + /// col_meta.dictionary_page_offset().is_some() + /// && col_meta.page_encoding_stats_mask().is_some_and(|mask| { + /// // mask should only have one bit set, either for PLAIN_DICTIONARY or + /// // RLE_DICTIONARY + /// mask.is_only(Encoding::PLAIN_DICTIONARY) || mask.is_only(Encoding::RLE_DICTIONARY) + /// }) + /// } + /// ``` + pub fn page_encoding_stats_mask(&self) -> Option<&EncodingMask> { + self.encoding_stats_mask.as_ref() + } + /// Returns the offset for the bloom filter. pub fn bloom_filter_offset(&self) -> Option { self.bloom_filter_offset @@ -1178,6 +1210,7 @@ impl ColumnChunkMetaDataBuilder { statistics: None, geo_statistics: None, encoding_stats: None, + encoding_stats_mask: None, bloom_filter_offset: None, bloom_filter_length: None, offset_index_offset: None, @@ -1278,6 +1311,12 @@ impl ColumnChunkMetaDataBuilder { self } + /// Sets page encoding stats mask for this column chunk. + pub fn set_page_encoding_stats_mask(mut self, value: EncodingMask) -> Self { + self.0.encoding_stats_mask = Some(value); + self + } + /// Clears the page encoding stats for this column chunk. pub fn clear_page_encoding_stats(mut self) -> Self { self.0.encoding_stats = None; @@ -1882,9 +1921,9 @@ mod tests { .build(); #[cfg(not(feature = "encryption"))] - let base_expected_size = 2766; + let base_expected_size = 2798; #[cfg(feature = "encryption")] - let base_expected_size = 2934; + let base_expected_size = 2966; assert_eq!(parquet_meta.memory_size(), base_expected_size); @@ -1913,9 +1952,9 @@ mod tests { .build(); #[cfg(not(feature = "encryption"))] - let bigger_expected_size = 3192; + let bigger_expected_size = 3224; #[cfg(feature = "encryption")] - let bigger_expected_size = 3360; + let bigger_expected_size = 3392; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); @@ -1962,7 +2001,7 @@ mod tests { .set_row_groups(row_group_meta.clone()) .build(); - let base_expected_size = 2058; + let base_expected_size = 2074; assert_eq!(parquet_meta_data.memory_size(), base_expected_size); let footer_key = "0123456789012345".as_bytes(); @@ -1988,7 +2027,7 @@ mod tests { .set_file_decryptor(Some(decryptor)) .build(); - let expected_size_with_decryptor = 3072; + let expected_size_with_decryptor = 3088; assert!(expected_size_with_decryptor > base_expected_size); assert_eq!( diff --git a/parquet/src/file/metadata/options.rs b/parquet/src/file/metadata/options.rs index bbc5314d3ac7..9e12a6d45261 100644 --- a/parquet/src/file/metadata/options.rs +++ b/parquet/src/file/metadata/options.rs @@ -17,6 +17,10 @@ //! Options used to control metadata parsing +use paste::paste; +use std::collections::HashSet; +use std::sync::Arc; + use crate::schema::types::SchemaDescPtr; /// Options that can be set to control what parts of the Parquet file footer @@ -29,6 +33,24 @@ use crate::schema::types::SchemaDescPtr; #[derive(Default, Debug, Clone)] pub struct ParquetMetaDataOptions { schema_descr: Option, + encoding_stats_as_mask: bool, + // The outer option acts as a global boolean, so if `skip_encoding_stats.is_some()` + // is `true` then we're at least skipping some stats. The inner `Option` is a keep + // list of column indicies to decode. + skip_encoding_stats: Option>>>, +} + +// wraps `set_X` with a `with_X` function that returns `Self` +macro_rules! add_mutator { + ($name:expr, $type:ty) => { + paste! { + #[doc = concat!("Call [`Self::set_", stringify!($name), "`] and return `Self` for chaining.")] + pub fn [](mut self, val: $type) -> Self { + self.[](val); + self + } + } + } } impl ParquetMetaDataOptions { @@ -48,11 +70,70 @@ impl ParquetMetaDataOptions { self.schema_descr = Some(val); } - /// Provide a schema to use when decoding the metadata. Returns `Self` for chaining. - pub fn with_schema(mut self, val: SchemaDescPtr) -> Self { - self.schema_descr = Some(val); - self + // with_schema + add_mutator!(schema, SchemaDescPtr); + + /// Returns whether to present the `encoding_stats` field of the `ColumnMetaData` as a + /// bitmask. + /// + /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this + /// might be desirable. + /// + /// [`ColumnChunkMetaData::page_encoding_stats_mask`]: + /// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask + pub fn encoding_stats_as_mask(&self) -> bool { + self.encoding_stats_as_mask + } + + /// Convert `encoding_stats` from a vector of [`PageEncodingStats`] to a bitmask. This can + /// speed up metadata decoding while still enabling some use cases served by the full stats. + /// + /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for more information. + /// + /// [`PageEncodingStats`]: crate::file::metadata::PageEncodingStats + /// [`ColumnChunkMetaData::page_encoding_stats_mask`]: + /// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask + pub fn set_encoding_stats_as_mask(&mut self, val: bool) { + self.encoding_stats_as_mask = val; } + + // with_encoding_stats_as_mask + add_mutator!(encoding_stats_as_mask, bool); + + /// Returns whether to skip decoding the `encoding_stats` in the `ColumnMetaData` + /// for the column indexed by `col_index`. + pub fn skip_encoding_stats(&self, col_index: usize) -> bool { + self.skip_encoding_stats + .as_ref() + .is_some_and(|oset| oset.as_ref().is_none_or(|keep| !keep.contains(&col_index))) + } + + /// Skip decoding of all `encoding_stats`. Takes precedence over + /// [`Self::encoding_stats_as_mask`]. + pub fn set_skip_encoding_stats(&mut self, val: bool) { + self.skip_encoding_stats = if val { Some(None) } else { None }; + } + + // with_skip_encoding_stats + add_mutator!(skip_encoding_stats, bool); + + /// Skip decoding of `encoding_stats`, but decode the stats for those columns in + /// the provided list of column indices. + /// + /// This allows for optimizations such as only decoding the page encoding statistics + /// for columns present in a predicate. + pub fn set_keep_encoding_stats(&mut self, keep: &[usize]) { + if keep.is_empty() { + self.set_skip_encoding_stats(true); + } else { + let mut keep_set = HashSet::::with_capacity(keep.len()); + keep_set.extend(keep.iter()); + self.skip_encoding_stats = Some(Some(Arc::new(keep_set))) + } + } + + // with_keep_encoding_stats + add_mutator!(keep_encoding_stats, &[usize]); } #[cfg(test)] diff --git a/parquet/src/file/metadata/thrift/encryption.rs b/parquet/src/file/metadata/thrift/encryption.rs index 56c5a6a4b9da..9713cf936dd2 100644 --- a/parquet/src/file/metadata/thrift/encryption.rs +++ b/parquet/src/file/metadata/thrift/encryption.rs @@ -113,6 +113,7 @@ pub(crate) struct FileCryptoMetaData<'a> { fn row_group_from_encrypted_thrift( mut rg: RowGroupMetaData, decryptor: Option<&FileDecryptor>, + options: Option<&ParquetMetaDataOptions>, ) -> Result { let schema_descr = rg.schema_descr; @@ -176,7 +177,7 @@ fn row_group_from_encrypted_thrift( // parse decrypted buffer and then replace fields in 'c' let mut prot = ThriftSliceInputProtocol::new(&decrypted_cc_buf); - let mask = read_column_metadata(&mut prot, &mut c)?; + let mask = read_column_metadata(&mut prot, &mut c, i, options)?; validate_column_metadata(mask)?; columns.push(c); @@ -297,7 +298,7 @@ pub(crate) fn parquet_metadata_with_encryption( // decrypt column chunk info let row_groups = row_groups .into_iter() - .map(|rg| row_group_from_encrypted_thrift(rg, file_decryptor.as_ref())) + .map(|rg| row_group_from_encrypted_thrift(rg, file_decryptor.as_ref(), options)) .collect::>>()?; let metadata = ParquetMetaDataBuilder::new(file_metadata) diff --git a/parquet/src/file/metadata/thrift/mod.rs b/parquet/src/file/metadata/thrift/mod.rs index 175a152839b4..96c49330d941 100644 --- a/parquet/src/file/metadata/thrift/mod.rs +++ b/parquet/src/file/metadata/thrift/mod.rs @@ -382,15 +382,41 @@ fn validate_column_metadata(mask: u16) -> Result<()> { Ok(()) } +fn read_encoding_stats_as_mask<'a>( + prot: &mut ThriftSliceInputProtocol<'a>, +) -> Result { + // read the vector of stats, setting mask bits for data pages + let mut mask = 0i32; + let list_ident = prot.read_list_begin()?; + for _ in 0..list_ident.size { + let pes = PageEncodingStats::read_thrift(prot)?; + match pes.page_type { + PageType::DATA_PAGE | PageType::DATA_PAGE_V2 => mask |= 1 << pes.encoding as i32, + _ => {} + } + } + EncodingMask::try_new(mask) +} + // Decode `ColumnMetaData`. Returns a mask of all required fields that were observed. // This mask can be passed to `validate_column_metadata`. fn read_column_metadata<'a>( prot: &mut ThriftSliceInputProtocol<'a>, column: &mut ColumnChunkMetaData, + col_index: usize, + options: Option<&ParquetMetaDataOptions>, ) -> Result { // mask for seen required fields in ColumnMetaData let mut seen_mask = 0u16; + let mut skip_pes = false; + let mut pes_mask = false; + + if let Some(opts) = options { + skip_pes = opts.skip_encoding_stats(col_index); + pes_mask = opts.encoding_stats_as_mask(); + } + // struct ColumnMetaData { // 1: required Type type // 2: required list encodings @@ -461,10 +487,15 @@ fn read_column_metadata<'a>( column.statistics = convert_stats(column_descr, Some(Statistics::read_thrift(&mut *prot)?))?; } - 13 => { - let val = - read_thrift_vec::(&mut *prot)?; - column.encoding_stats = Some(val); + 13 if !skip_pes => { + if pes_mask { + let val = read_encoding_stats_as_mask(&mut *prot)?; + column.encoding_stats_mask = Some(val); + } else { + let val = + read_thrift_vec::(&mut *prot)?; + column.encoding_stats = Some(val); + } } 14 => { column.bloom_filter_offset = Some(i64::read_thrift(&mut *prot)?); @@ -499,6 +530,8 @@ fn read_column_metadata<'a>( fn read_column_chunk<'a>( prot: &mut ThriftSliceInputProtocol<'a>, column_descr: &Arc, + col_index: usize, + options: Option<&ParquetMetaDataOptions>, ) -> Result { // create a default initialized ColumnMetaData let mut col = ColumnChunkMetaDataBuilder::new(column_descr.clone()).build()?; @@ -535,7 +568,7 @@ fn read_column_chunk<'a>( has_file_offset = true; } 3 => { - col_meta_mask = read_column_metadata(&mut *prot, &mut col)?; + col_meta_mask = read_column_metadata(&mut *prot, &mut col, col_index, options)?; } 4 => { col.offset_index_offset = Some(i64::read_thrift(&mut *prot)?); @@ -585,6 +618,7 @@ fn read_column_chunk<'a>( fn read_row_group( prot: &mut ThriftSliceInputProtocol, schema_descr: &Arc, + options: Option<&ParquetMetaDataOptions>, ) -> Result { // create default initialized RowGroupMetaData let mut row_group = RowGroupMetaDataBuilder::new(schema_descr.clone()).build_unchecked(); @@ -623,7 +657,7 @@ fn read_row_group( )); } for i in 0..list_ident.size as usize { - let col = read_column_chunk(prot, &schema_descr.columns()[i])?; + let col = read_column_chunk(prot, &schema_descr.columns()[i], i, options)?; row_group.columns.push(col); } mask |= RG_COLUMNS; @@ -766,7 +800,7 @@ pub(crate) fn parquet_metadata_from_bytes( let list_ident = prot.read_list_begin()?; let mut rg_vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - rg_vec.push(read_row_group(&mut prot, schema_descr)?); + rg_vec.push(read_row_group(&mut prot, schema_descr, options)?); } row_groups = Some(rg_vec); } @@ -1625,7 +1659,7 @@ pub(crate) mod tests { schema_descr: Arc, ) -> Result { let mut reader = ThriftSliceInputProtocol::new(buf); - crate::file::metadata::thrift::read_row_group(&mut reader, &schema_descr) + crate::file::metadata::thrift::read_row_group(&mut reader, &schema_descr, None) } pub(crate) fn read_column_chunk( @@ -1633,7 +1667,7 @@ pub(crate) mod tests { column_descr: Arc, ) -> Result { let mut reader = ThriftSliceInputProtocol::new(buf); - crate::file::metadata::thrift::read_column_chunk(&mut reader, &column_descr) + crate::file::metadata::thrift::read_column_chunk(&mut reader, &column_descr, 0, None) } pub(crate) fn roundtrip_schema(schema: TypePtr) -> Result { diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 990b2f4f1699..b64eb22ae539 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -160,6 +160,27 @@ impl ReadOptionsBuilder { self } + /// Set whether to convert `encoding_stats` to a bitmask. + /// + /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this + /// might be desirable. + pub fn with_encoding_stats_as_mask(mut self, val: bool) -> Self { + self.metadata_options.set_encoding_stats_as_mask(val); + self + } + + /// Set whether to skip decoding `encoding_stats`. + pub fn with_skip_encoding_stats(mut self, val: bool) -> Self { + self.metadata_options.set_skip_encoding_stats(val); + self + } + + /// Provide a list of column indicies for which to decode `encoding_stats`. + pub fn with_keep_encoding_stats(mut self, keep: &[usize]) -> Self { + self.metadata_options.set_keep_encoding_stats(keep); + self + } + /// Seal the builder and return the read options pub fn build(self) -> ReadOptions { let props = self @@ -1857,6 +1878,67 @@ mod tests { assert_eq!(col0_metadata.offset_index_length().unwrap(), 11); } + #[test] + fn test_file_reader_page_stats_mask() { + let file = get_test_file("alltypes_tiny_pages.parquet"); + let options = ReadOptionsBuilder::new() + .with_encoding_stats_as_mask(true) + .build(); + let file_reader = Arc::new(SerializedFileReader::new_with_options(file, options).unwrap()); + + let row_group_metadata = file_reader.metadata.row_group(0); + + // test page encoding stats + let page_encoding_stats = row_group_metadata + .column(0) + .page_encoding_stats_mask() + .unwrap(); + assert!(page_encoding_stats.is_set(Encoding::PLAIN)); + // PLAIN = 0, so 1 << 0 or 1 + assert_eq!(page_encoding_stats.as_i32() ^ 1, 0); + let page_encoding_stats = row_group_metadata + .column(2) + .page_encoding_stats_mask() + .unwrap(); + assert!(page_encoding_stats.is_set(Encoding::PLAIN_DICTIONARY)); + // PLAIN_DICTIONARY = 2, so 1 << 2 + assert_eq!(page_encoding_stats.as_i32() ^ (1 << 2), 0); + } + + #[test] + fn test_file_reader_page_stats_skipped() { + let file = get_test_file("alltypes_tiny_pages.parquet"); + + // test skipping all + let options = ReadOptionsBuilder::new() + .with_skip_encoding_stats(true) + .build(); + let file_reader = Arc::new( + SerializedFileReader::new_with_options(file.try_clone().unwrap(), options).unwrap(), + ); + + let row_group_metadata = file_reader.metadata.row_group(0); + for column in row_group_metadata.columns() { + assert!(column.page_encoding_stats().is_none()); + assert!(column.page_encoding_stats_mask().is_none()); + } + + // test skipping all but one column + let options = ReadOptionsBuilder::new() + .with_encoding_stats_as_mask(true) + .with_keep_encoding_stats(&[0]) + .build(); + let file_reader = Arc::new( + SerializedFileReader::new_with_options(file.try_clone().unwrap(), options).unwrap(), + ); + + let row_group_metadata = file_reader.metadata.row_group(0); + for (idx, column) in row_group_metadata.columns().iter().enumerate() { + assert!(column.page_encoding_stats().is_none()); + assert_eq!(column.page_encoding_stats_mask().is_some(), idx == 0); + } + } + #[test] fn test_file_reader_with_no_filter() -> Result<()> { let test_file = get_test_file("alltypes_plain.parquet"); From e545319fc4617049a96e8204f89cacea34ec2d3e Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 11 Nov 2025 13:21:28 -0800 Subject: [PATCH 2/4] typo --- parquet/src/arrow/arrow_reader/mod.rs | 2 +- parquet/src/file/metadata/options.rs | 2 +- parquet/src/file/serialized_reader.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 5caf3687b2c4..88ae656d7ff4 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -555,7 +555,7 @@ impl ArrowReaderOptions { self } - /// Provide a list of column indicies for which to decode `encoding_stats`. + /// Provide a list of column indices for which to decode `encoding_stats`. pub fn with_keep_encoding_stats(mut self, keep: &[usize]) -> Self { self.metadata_options.set_keep_encoding_stats(keep); self diff --git a/parquet/src/file/metadata/options.rs b/parquet/src/file/metadata/options.rs index 9e12a6d45261..5573ce8b35e5 100644 --- a/parquet/src/file/metadata/options.rs +++ b/parquet/src/file/metadata/options.rs @@ -36,7 +36,7 @@ pub struct ParquetMetaDataOptions { encoding_stats_as_mask: bool, // The outer option acts as a global boolean, so if `skip_encoding_stats.is_some()` // is `true` then we're at least skipping some stats. The inner `Option` is a keep - // list of column indicies to decode. + // list of column indices to decode. skip_encoding_stats: Option>>>, } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index b64eb22ae539..3e1dce690c9b 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -175,7 +175,7 @@ impl ReadOptionsBuilder { self } - /// Provide a list of column indicies for which to decode `encoding_stats`. + /// Provide a list of column indices for which to decode `encoding_stats`. pub fn with_keep_encoding_stats(mut self, keep: &[usize]) -> Self { self.metadata_options.set_keep_encoding_stats(keep); self From c096b200e2ced263ab2f66e46fc927e4ea27e701 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 11 Nov 2025 13:25:39 -0800 Subject: [PATCH 3/4] rework tests --- parquet/src/arrow/arrow_reader/mod.rs | 8 ++------ parquet/src/file/serialized_reader.rs | 8 ++------ 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 88ae656d7ff4..0d4b44b655b3 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1323,16 +1323,12 @@ mod tests { .column(0) .page_encoding_stats_mask() .unwrap(); - assert!(page_encoding_stats.is_set(Encoding::PLAIN)); - // PLAIN = 0, so 1 << 0 or 1 - assert_eq!(page_encoding_stats.as_i32() ^ 1, 0); + assert!(page_encoding_stats.is_only(Encoding::PLAIN)); let page_encoding_stats = row_group_metadata .column(2) .page_encoding_stats_mask() .unwrap(); - assert!(page_encoding_stats.is_set(Encoding::PLAIN_DICTIONARY)); - // PLAIN_DICTIONARY = 2, so 1 << 2 - assert_eq!(page_encoding_stats.as_i32() ^ (1 << 2), 0); + assert!(page_encoding_stats.is_only(Encoding::PLAIN_DICTIONARY)); } #[test] diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 3e1dce690c9b..5a27ef9a0e88 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1893,16 +1893,12 @@ mod tests { .column(0) .page_encoding_stats_mask() .unwrap(); - assert!(page_encoding_stats.is_set(Encoding::PLAIN)); - // PLAIN = 0, so 1 << 0 or 1 - assert_eq!(page_encoding_stats.as_i32() ^ 1, 0); + assert!(page_encoding_stats.is_only(Encoding::PLAIN)); let page_encoding_stats = row_group_metadata .column(2) .page_encoding_stats_mask() .unwrap(); - assert!(page_encoding_stats.is_set(Encoding::PLAIN_DICTIONARY)); - // PLAIN_DICTIONARY = 2, so 1 << 2 - assert_eq!(page_encoding_stats.as_i32() ^ (1 << 2), 0); + assert!(page_encoding_stats.is_only(Encoding::PLAIN_DICTIONARY)); } #[test] From 5aaa8e3dbc4b9ac4d83a8ffcfb4afb4bf724d76e Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 12 Nov 2025 10:02:08 -0800 Subject: [PATCH 4/4] improve docs --- parquet/src/arrow/arrow_reader/mod.rs | 9 +++++++-- parquet/src/file/metadata/options.rs | 29 +++++++++++++++++++-------- parquet/src/file/serialized_reader.rs | 10 +++++++-- 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 0d4b44b655b3..5f1ebd383936 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -537,19 +537,24 @@ impl ArrowReaderOptions { self } - /// Set whether to convert `encoding_stats` to a bitmask. + /// Set whether to convert the [`encoding_stats`] in the Parquet `ColumnMetaData` to a bitmask. /// /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this /// might be desirable. /// /// [`ColumnChunkMetaData::page_encoding_stats_mask`]: /// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask + /// [`encoding_stats`]: + /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917 pub fn with_encoding_stats_as_mask(mut self, val: bool) -> Self { self.metadata_options.set_encoding_stats_as_mask(val); self } - /// Set whether to skip decoding `encoding_stats`. + /// Set whether to skip decoding the [`encoding_stats`] field of the Parquet `ColumnMetaData`. + /// + /// [`encoding_stats`]: + /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917 pub fn with_skip_encoding_stats(mut self, val: bool) -> Self { self.metadata_options.set_skip_encoding_stats(val); self diff --git a/parquet/src/file/metadata/options.rs b/parquet/src/file/metadata/options.rs index 5573ce8b35e5..1369cecf1f09 100644 --- a/parquet/src/file/metadata/options.rs +++ b/parquet/src/file/metadata/options.rs @@ -73,19 +73,21 @@ impl ParquetMetaDataOptions { // with_schema add_mutator!(schema, SchemaDescPtr); - /// Returns whether to present the `encoding_stats` field of the `ColumnMetaData` as a - /// bitmask. + /// Returns whether to present the [`encoding_stats`] field of the Parquet `ColumnMetaData` + /// as a bitmask. /// /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this /// might be desirable. /// /// [`ColumnChunkMetaData::page_encoding_stats_mask`]: /// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask + /// [`encoding_stats`]: + /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917 pub fn encoding_stats_as_mask(&self) -> bool { self.encoding_stats_as_mask } - /// Convert `encoding_stats` from a vector of [`PageEncodingStats`] to a bitmask. This can + /// Convert [`encoding_stats`] from a vector of [`PageEncodingStats`] to a bitmask. This can /// speed up metadata decoding while still enabling some use cases served by the full stats. /// /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for more information. @@ -93,6 +95,8 @@ impl ParquetMetaDataOptions { /// [`PageEncodingStats`]: crate::file::metadata::PageEncodingStats /// [`ColumnChunkMetaData::page_encoding_stats_mask`]: /// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask + /// [`encoding_stats`]: + /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917 pub fn set_encoding_stats_as_mask(&mut self, val: bool) { self.encoding_stats_as_mask = val; } @@ -100,16 +104,22 @@ impl ParquetMetaDataOptions { // with_encoding_stats_as_mask add_mutator!(encoding_stats_as_mask, bool); - /// Returns whether to skip decoding the `encoding_stats` in the `ColumnMetaData` + /// Returns whether to skip decoding the [`encoding_stats`] in the Parquet `ColumnMetaData` /// for the column indexed by `col_index`. + /// + /// [`encoding_stats`]: + /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917 pub fn skip_encoding_stats(&self, col_index: usize) -> bool { self.skip_encoding_stats .as_ref() .is_some_and(|oset| oset.as_ref().is_none_or(|keep| !keep.contains(&col_index))) } - /// Skip decoding of all `encoding_stats`. Takes precedence over - /// [`Self::encoding_stats_as_mask`]. + /// Sets whether to skip decoding of all [`encoding_stats`] in the Parquet `ColumnMetaData`. + /// Takes precedence over [`Self::encoding_stats_as_mask`]. + /// + /// [`encoding_stats`]: + /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917 pub fn set_skip_encoding_stats(&mut self, val: bool) { self.skip_encoding_stats = if val { Some(None) } else { None }; } @@ -117,11 +127,14 @@ impl ParquetMetaDataOptions { // with_skip_encoding_stats add_mutator!(skip_encoding_stats, bool); - /// Skip decoding of `encoding_stats`, but decode the stats for those columns in - /// the provided list of column indices. + /// Skip decoding of [`encoding_stats`] in the Parquet `ColumnMetaData`, but decode the stats + /// for those columns in the provided list of column indices. /// /// This allows for optimizations such as only decoding the page encoding statistics /// for columns present in a predicate. + /// + /// [`encoding_stats`]: + /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917 pub fn set_keep_encoding_stats(&mut self, keep: &[usize]) { if keep.is_empty() { self.set_skip_encoding_stats(true); diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 5a27ef9a0e88..9ca7283b5355 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -160,16 +160,22 @@ impl ReadOptionsBuilder { self } - /// Set whether to convert `encoding_stats` to a bitmask. + /// Set whether to convert the [`encoding_stats`] in the Parquet `ColumnMetaData` to a bitmask. /// /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this /// might be desirable. + /// + /// [`encoding_stats`]: + /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917 pub fn with_encoding_stats_as_mask(mut self, val: bool) -> Self { self.metadata_options.set_encoding_stats_as_mask(val); self } - /// Set whether to skip decoding `encoding_stats`. + /// Set whether to skip decoding the [`encoding_stats`] field of the Parquet `ColumnMetaData`. + /// + /// [`encoding_stats`]: + /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917 pub fn with_skip_encoding_stats(mut self, val: bool) -> Self { self.metadata_options.set_skip_encoding_stats(val); self