Skip to content

Commit f004a1f

Browse files
committed
add some documentation
1 parent 49753b7 commit f004a1f

File tree

4 files changed

+54
-10
lines changed

4 files changed

+54
-10
lines changed

parquet/src/arrow/arrow_reader/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,12 @@ impl ArrowReaderOptions {
519519
}
520520

521521
/// Set whether to convert `encoding_stats` to a bitmask.
522+
///
523+
/// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this
524+
/// might be desirable.
525+
///
526+
/// [`ColumnChunkMetaData::page_encoding_stats_mask`]:
527+
/// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask
522528
pub fn with_encoding_stats_as_mask(mut self, val: bool) -> Self {
523529
self.metadata_options.set_encoding_stats_as_mask(val);
524530
self

parquet/src/file/metadata/mod.rs

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1051,18 +1051,39 @@ impl ColumnChunkMetaData {
10511051
self.geo_statistics.as_deref()
10521052
}
10531053

1054-
/// Returns the page encoding stats,
1055-
/// or `None` if no page encoding stats are available.
1054+
/// Returns the page encoding statistics, or `None` if no page encoding statistics
1055+
/// are available.
10561056
pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
10571057
self.encoding_stats.as_ref()
10581058
}
10591059

1060-
/// Returns the page encoding stats reduced to a bitmask.
1060+
/// Returns the page encoding statistics reduced to a bitmask, or `None` if statistics are
1061+
/// not available.
10611062
///
1062-
/// Decoding the full page encoding statistics can be costly, and is not always necessary.
1063-
/// This field contains a mask of all encodings used for data pages. This can still support
1064-
/// some uses of the full statistics, such as determining if all data pages are dictionary
1065-
/// encoded.
1063+
/// The [`PageEncodingStats`] struct was added to the Parquet specification specifically to
1064+
/// enable fast determination of whether all pages in a column chunk are dictionary encoded
1065+
/// (see <https://github.com/apache/parquet-format/pull/16>).
1066+
/// Decoding the full page encoding statistics, however, can be very costly, and is not
1067+
/// necessary to support the aforementioned use case. As an alternative, this crate can
1068+
/// instead distill the list of `PageEncodingStats` down to a bitmask of just the encodings
1069+
/// used for data pages
1070+
/// (see [`ParquetMetaDataOptions::set_encoding_stats_as_mask`]).
1071+
/// To test for an all-dictionary-encoded chunk, one could use this bitmask in the following way:
1072+
///
1073+
/// ```rust
1074+
/// use parquet::basic::Encoding;
1075+
/// use parquet::file::metadata::ColumnChunkMetaData;
1076+
/// // test if all data pages in the column chunk are dictionary encoded
1077+
/// fn is_all_dictionary_encoded(col_meta: &ColumnChunkMetaData) -> bool {
1078+
/// // check that dictionary encoding was used
1079+
/// col_meta.dictionary_page_offset().is_some()
1080+
/// && col_meta.page_encoding_stats_mask().is_some_and(|mask| {
1081+
/// // mask should only have one bit set, either for PLAIN_DICTIONARY or
1082+
/// // RLE_DICTIONARY
1083+
/// mask.is_only(Encoding::PLAIN_DICTIONARY) || mask.is_only(Encoding::RLE_DICTIONARY)
1084+
/// })
1085+
/// }
1086+
/// ```
10661087
pub fn page_encoding_stats_mask(&self) -> Option<&EncodingMask> {
10671088
self.encoding_stats_mask.as_ref()
10681089
}

parquet/src/file/metadata/options.rs

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,14 +75,24 @@ impl ParquetMetaDataOptions {
7575

7676
/// Returns whether to present the `encoding_stats` field of the `ColumnMetaData` as a
7777
/// bitmask.
78+
///
79+
/// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this
80+
/// might be desirable.
81+
///
82+
/// [`ColumnChunkMetaData::page_encoding_stats_mask`]:
83+
/// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask
7884
pub fn encoding_stats_as_mask(&self) -> bool {
7985
self.encoding_stats_as_mask
8086
}
8187

8288
/// Set whether to convert `encoding_stats` from a vector of [`PageEncodingStats`] to a bitmask.
8389
/// Doing so can speed up metadata decoding while still enabling some use cases served by the full stats.
8490
///
91+
/// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for more information.
92+
///
8593
/// [`PageEncodingStats`]: crate::file::metadata::PageEncodingStats
94+
/// [`ColumnChunkMetaData::page_encoding_stats_mask`]:
95+
/// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask
8696
pub fn set_encoding_stats_as_mask(&mut self, val: bool) {
8797
self.encoding_stats_as_mask = val;
8898
}
@@ -100,16 +110,20 @@ impl ParquetMetaDataOptions {
100110
.is_some_and(|oset| oset.as_ref().is_none_or(|keep| !keep.contains(&col_index)))
101111
}
102112

103-
/// Skip decoding of all `encoding_stats`. Takes precedence over `encoding_stats_as_mask`.
113+
/// Skip decoding of all `encoding_stats`. Takes precedence over
114+
/// [`Self::encoding_stats_as_mask`].
104115
pub fn set_skip_encoding_stats(&mut self, val: bool) {
105116
self.skip_encoding_stats = if val { Some(None) } else { None };
106117
}
107118

108119
// with_skip_encoding_stats
109120
add_mutator!(skip_encoding_stats, bool);
110121

111-
/// Skip decoding of `encoding_stats`, but decode the stats for those column in
112-
/// provided list of column indices.
122+
/// Skip decoding of `encoding_stats`, but decode the stats for those columns in
123+
/// the provided list of column indices.
124+
///
125+
/// This allows for optimizations such as only decoding the page encoding statistics
126+
/// for columns present in a predicate.
113127
pub fn set_keep_encoding_stats(&mut self, keep: &[usize]) {
114128
if keep.is_empty() {
115129
self.set_skip_encoding_stats(true);

parquet/src/file/serialized_reader.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,9 @@ impl ReadOptionsBuilder {
161161
}
162162

163163
/// Set whether to convert `encoding_stats` to a bitmask.
164+
///
165+
/// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this
166+
/// might be desirable.
164167
pub fn with_encoding_stats_as_mask(mut self, val: bool) -> Self {
165168
self.metadata_options.set_encoding_stats_as_mask(val);
166169
self

0 commit comments

Comments
 (0)