iceberg表文件结构
iceberg库表层次关系如下图所示:

表目录结构如下图所示:

表空间目录下,数据存放在data子目录,元数据存放在metadata子目录。
案例
一张iceberg表的文件组织结构如下所示:
/data/warehouse/iceberg_test.db/test001/data
/data/warehouse/iceberg_test.db/test001/metadata
数据目录(data)
数据目录结构如下所示:
/data/warehouse/iceberg_test.db/test001/data/00000-1-0e9ae40d-250d-4525-b140-3f50e1427941-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00000-3-c40c9d93-45ec-43b0-8046-cbd3d23ec3a1-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00000-5-1a9166f6-2b8a-4f9c-b2a6-a948ed0968f9-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00000-7-2609dde8-9f25-4593-870f-8c9f2ea6c282-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00000-9-610d6cea-c212-47ed-b015-be6704480ee7-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00001-10-610d6cea-c212-47ed-b015-be6704480ee7-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00001-2-0e9ae40d-250d-4525-b140-3f50e1427941-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00001-4-c40c9d93-45ec-43b0-8046-cbd3d23ec3a1-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00001-6-1a9166f6-2b8a-4f9c-b2a6-a948ed0968f9-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00001-8-2609dde8-9f25-4593-870f-8c9f2ea6c282-0-00001.parquet
如果设置了分区,则包含分区子目录。第一次隐式分区也有对应的分区子目录。
如果没设置分区,则所有的数据文件都在data目录下。
元数据目录(metadata)
元数据目录如下所示:
# metadata json file
/data/warehouse/iceberg_test.db/test001/metadata/00000-9b04bd23-c3c4-4e05-ab15-720d9a070375.metadata.json
/data/warehouse/iceberg_test.db/test001/metadata/00001-bc385dea-3727-4744-8876-b689c79435d0.metadata.json
/data/warehouse/iceberg_test.db/test001/metadata/00002-5065399d-65dc-4604-a952-7f8427d4cbcd.metadata.json
/data/warehouse/iceberg_test.db/test001/metadata/00003-ed2629ef-5032-4996-9eb4-05836bf5a01e.metadata.json
/data/warehouse/iceberg_test.db/test001/metadata/00004-b2cb2152-d94e-42dd-a7ec-f98f152d8643.metadata.json
/data/warehouse/iceberg_test.db/test001/metadata/00005-040c65a8-c8bc-430b-96b8-a6f14fe01104.metadata.json
# manifest file
/data/warehouse/iceberg_test.db/test001/metadata/63340959-0aeb-4652-b735-e3a75cac1ff6-m0.avro
/data/warehouse/iceberg_test.db/test001/metadata/68108928-2b8f-44c1-9942-9a1e57d24473-m0.avro
/data/warehouse/iceberg_test.db/test001/metadata/a0d388c7-83c4-49a6-9a32-b058e2e4f919-m0.avro
/data/warehouse/iceberg_test.db/test001/metadata/a6339012-ecd8-4bc3-98e4-3bf81a682e62-m0.avro
/data/warehouse/iceberg_test.db/test001/metadata/c7c8aa50-1ab3-4bfc-80fe-d9cf593a3711-m0.avro
# manifest list file
/data/warehouse/iceberg_test.db/test001/metadata/snap-2106022616408784606-1-63340959-0aeb-4652-b735-e3a75cac1ff6.avro
/data/warehouse/iceberg_test.db/test001/metadata/snap-3652109360786887717-1-c7c8aa50-1ab3-4bfc-80fe-d9cf593a3711.avro
/data/warehouse/iceberg_test.db/test001/metadata/snap-3979142577400722791-1-68108928-2b8f-44c1-9942-9a1e57d24473.avro
/data/warehouse/iceberg_test.db/test001/metadata/snap-713236168318586475-1-a6339012-ecd8-4bc3-98e4-3bf81a682e62.avro
/data/warehouse/iceberg_test.db/test001/metadata/snap-7334379460333501439-1-a0d388c7-83c4-49a6-9a32-b058e2e4f919.avro
metadata文件
文件名格式为“{五位数字}-{UUID}.metadata.json”
文件内容示例如下:
{
  "format-version" : 2,
  "table-uuid" : "ea5ee732-3cdf-4b86-9732-49ab4e23600a",
  "location" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001",
  "last-sequence-number" : 5,
  "last-updated-ms" : 1758803845006,
  "last-column-id" : 2,
  "current-schema-id" : 0,
  "schemas" : [ {
    "type" : "struct",
    "schema-id" : 0,
    "fields" : [ {
      "id" : 1,
      "name" : "id",
      "required" : true,
      "type" : "long",
      "doc" : "unique id"
    }, {
      "id" : 2,
      "name" : "data",
      "required" : false,
      "type" : "string"
    } ]
  } ],
  "default-spec-id" : 0,
  "partition-specs" : [ {
    "spec-id" : 0,
    "fields" : [ ]
  } ],
  "last-partition-id" : 999,
  "default-sort-order-id" : 0,
  "sort-orders" : [ {
    "order-id" : 0,
    "fields" : [ ]
  } ],
  "properties" : {
    "owner" : "hadoop",
    "write.parquet.compression-codec" : "zstd"
  },
  "current-snapshot-id" : 2106022616408784606,
  "refs" : {
    "main" : {
      "snapshot-id" : 2106022616408784606,
      "type" : "branch"
    }
  },
  "snapshots" : [ {
    "sequence-number" : 1,
    "snapshot-id" : 7334379460333501439,
    "timestamp-ms" : 1758803839578,
    "summary" : {
      "operation" : "append",
      "spark.app.id" : "application_1758599307040_0004",
      "added-data-files" : "2",
      "added-records" : "2",
      "added-files-size" : "1288",
      "changed-partition-count" : "1",
      "total-records" : "2",
      "total-files-size" : "1288",
      "total-data-files" : "2",
      "total-delete-files" : "0",
      "total-position-deletes" : "0",
      "total-equality-deletes" : "0",
      "engine-version" : "3.5.1",
      "app-id" : "application_1758599307040_0004",
      "engine-name" : "spark",
      "iceberg-version" : "Apache Iceberg 1.8.1"
    },
    "manifest-list" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/snap-7334379460333501439-1-a0d388c7-83c4-49a6-9a32-b058e2e4f919.avro",
    "schema-id" : 0
  }, {
    "sequence-number" : 2,
    "snapshot-id" : 3979142577400722791,
    "parent-snapshot-id" : 7334379460333501439,
    "timestamp-ms" : 1758803841054,
    "summary" : {
      "operation" : "append",
      "spark.app.id" : "application_1758599307040_0004",
      "added-data-files" : "2",
      "added-records" : "2",
      "added-files-size" : "1302",
      "changed-partition-count" : "1",
      "total-records" : "4",
      "total-files-size" : "2590",
      "total-data-files" : "4",
      "total-delete-files" : "0",
      "total-position-deletes" : "0",
      "total-equality-deletes" : "0",
      "engine-version" : "3.5.1",
      "app-id" : "application_1758599307040_0004",
      "engine-name" : "spark",
      "iceberg-version" : "Apache Iceberg 1.8.1"
    },
    "manifest-list" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/snap-3979142577400722791-1-68108928-2b8f-44c1-9942-9a1e57d24473.avro",
    "schema-id" : 0
  }, {
    "sequence-number" : 3,
    "snapshot-id" : 3652109360786887717,
    "parent-snapshot-id" : 3979142577400722791,
    "timestamp-ms" : 1758803842388,
    "summary" : {
      "operation" : "append",
      "spark.app.id" : "application_1758599307040_0004",
      "added-data-files" : "2",
      "added-records" : "2",
      "added-files-size" : "1316",
      "changed-partition-count" : "1",
      "total-records" : "6",
      "total-files-size" : "3906",
      "total-data-files" : "6",
      "total-delete-files" : "0",
      "total-position-deletes" : "0",
      "total-equality-deletes" : "0",
      "engine-version" : "3.5.1",
      "app-id" : "application_1758599307040_0004",
      "engine-name" : "spark",
      "iceberg-version" : "Apache Iceberg 1.8.1"
    },
    "manifest-list" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/snap-3652109360786887717-1-c7c8aa50-1ab3-4bfc-80fe-d9cf593a3711.avro",
    "schema-id" : 0
  }, {
    "sequence-number" : 4,
    "snapshot-id" : 713236168318586475,
    "parent-snapshot-id" : 3652109360786887717,
    "timestamp-ms" : 1758803843700,
    "summary" : {
      "operation" : "append",
      "spark.app.id" : "application_1758599307040_0004",
      "added-data-files" : "2",
      "added-records" : "2",
      "added-files-size" : "1330",
      "changed-partition-count" : "1",
      "total-records" : "8",
      "total-files-size" : "5236",
      "total-data-files" : "8",
      "total-delete-files" : "0",
      "total-position-deletes" : "0",
      "total-equality-deletes" : "0",
      "engine-version" : "3.5.1",
      "app-id" : "application_1758599307040_0004",
      "engine-name" : "spark",
      "iceberg-version" : "Apache Iceberg 1.8.1"
    },
    "manifest-list" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/snap-713236168318586475-1-a6339012-ecd8-4bc3-98e4-3bf81a682e62.avro",
    "schema-id" : 0
  }, {
    "sequence-number" : 5,
    "snapshot-id" : 2106022616408784606,
    "parent-snapshot-id" : 713236168318586475,
    "timestamp-ms" : 1758803845006,
    "summary" : {
      "operation" : "append",
      "spark.app.id" : "application_1758599307040_0004",
      "added-data-files" : "2",
      "added-records" : "2",
      "added-files-size" : "1344",
      "changed-partition-count" : "1",
      "total-records" : "10",
      "total-files-size" : "6580",
      "total-data-files" : "10",
      "total-delete-files" : "0",
      "total-position-deletes" : "0",
      "total-equality-deletes" : "0",
      "engine-version" : "3.5.1",
      "app-id" : "application_1758599307040_0004",
      "engine-name" : "spark",
      "iceberg-version" : "Apache Iceberg 1.8.1"
    },
    "manifest-list" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/snap-2106022616408784606-1-63340959-0aeb-4652-b735-e3a75cac1ff6.avro",
    "schema-id" : 0
  } ],
  "statistics" : [ ],
  "partition-statistics" : [ ],
  "snapshot-log" : [ {
    "timestamp-ms" : 1758803839578,
    "snapshot-id" : 7334379460333501439
  }, {
    "timestamp-ms" : 1758803841054,
    "snapshot-id" : 3979142577400722791
  }, {
    "timestamp-ms" : 1758803842388,
    "snapshot-id" : 3652109360786887717
  }, {
    "timestamp-ms" : 1758803843700,
    "snapshot-id" : 713236168318586475
  }, {
    "timestamp-ms" : 1758803845006,
    "snapshot-id" : 2106022616408784606
  } ],
  "metadata-log" : [ {
    "timestamp-ms" : 1758803828850,
    "metadata-file" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/00000-9b04bd23-c3c4-4e05-ab15-720d9a070375.metadata.json"
  }, {
    "timestamp-ms" : 1758803839578,
    "metadata-file" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/00001-bc385dea-3727-4744-8876-b689c79435d0.metadata.json"
  }, {
    "timestamp-ms" : 1758803841054,
    "metadata-file" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/00002-5065399d-65dc-4604-a952-7f8427d4cbcd.metadata.json"
  }, {
    "timestamp-ms" : 1758803842388,
    "metadata-file" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/00003-ed2629ef-5032-4996-9eb4-05836bf5a01e.metadata.json"
  }, {
    "timestamp-ms" : 1758803843700,
    "metadata-file" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/00004-b2cb2152-d94e-42dd-a7ec-f98f152d8643.metadata.json"
  } ]
}
manifest list文件
文件名格式为:snap-{snapshotID}-{attemptID}-{commitUUID}.avro
manifest list文件记录了manifest file和统计信息。
文件内容如下所示:
{
    "manifest_path": "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/a6339012-ecd8-4bc3-98e4-3bf81a682e62-m0.avro",
    "manifest_length": 6727,
    "partition_spec_id": 0,
    "content": 0,
    "sequence_number": 4,
    "min_sequence_number": 4,
    "added_snapshot_id": 713236168318586475,
    "added_files_count": 2,
    "existing_files_count": 0,
    "deleted_files_count": 0,
    "added_rows_count": 2,
    "existing_rows_count": 0,
    "deleted_rows_count": 0,
    "partitions": {
        "array": [
        ]
    },
    "key_metadata": null
}
{
    "manifest_path": "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/c7c8aa50-1ab3-4bfc-80fe-d9cf593a3711-m0.avro",
    "manifest_length": 6728,
    "partition_spec_id": 0,
    "content": 0,
    "sequence_number": 3,
    "min_sequence_number": 3,
    "added_snapshot_id": 3652109360786887717,
    "added_files_count": 2,
    "existing_files_count": 0,
    "deleted_files_count": 0,
    "added_rows_count": 2,
    "existing_rows_count": 0,
    "deleted_rows_count": 0,
    "partitions": {
        "array": [
        ]
    },
    "key_metadata": null
}
{
    "manifest_path": "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/68108928-2b8f-44c1-9942-9a1e57d24473-m0.avro",
    "manifest_length": 6726,
    "partition_spec_id": 0,
    "content": 0,
    "sequence_number": 2,
    "min_sequence_number": 2,
    "added_snapshot_id": 3979142577400722791,
    "added_files_count": 2,
    "existing_files_count": 0,
    "deleted_files_count": 0,
    "added_rows_count": 2,
    "existing_rows_count": 0,
    "deleted_rows_count": 0,
    "partitions": {
        "array": [
        ]
    },
    "key_metadata": null
}
{
    "manifest_path": "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/a0d388c7-83c4-49a6-9a32-b058e2e4f919-m0.avro",
    "manifest_length": 6722,
    "partition_spec_id": 0,
    "content": 0,
    "sequence_number": 1,
    "min_sequence_number": 1,
    "added_snapshot_id": 7334379460333501439,
    "added_files_count": 2,
    "existing_files_count": 0,
    "deleted_files_count": 0,
    "added_rows_count": 2,
    "existing_rows_count": 0,
    "deleted_rows_count": 0,
    "partitions": {
        "array": [
        ]
    },
    "key_metadata": null
}
通过以下命令可查看一个avro文件:
java -jar avro-tools-1.12.0.jar tojson snap-713236168318586475-1-a6339012-ecd8-4bc3-98e4-3bf81a682e62.avro
输出结果为一行一行的数据,每一行都是一个json字符串,上述案例把json字符串展开了。
manifest file文件
文件名格式为:{commitUUID}-m{manifestCount}.avro
manifest file记录了data文件和统计信息。
文件内容如下所示:
{
    "status": 1,
    "snapshot_id": {
        "long": 713236168318586475
    },
    "sequence_number": null,
    "file_sequence_number": null,
    "data_file": {
        "content": 0,
        "file_path": "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/data/00000-7-2609dde8-9f25-4593-870f-8c9f2ea6c282-0-00001.parquet",
        "file_format": "PARQUET",
        "partition": {
        },
        "record_count": 1,
        "file_size_in_bytes": 665,
        "column_sizes": {
            "array": [
                {
                    "key": 1,
                    "value": 40
                },
                {
                    "key": 2,
                    "value": 40
                }
            ]
        },
        "value_counts": {
            "array": [
                {
                    "key": 1,
                    "value": 1
                },
                {
                    "key": 2,
                    "value": 1
                }
            ]
        },
        "null_value_counts": {
            "array": [
                {
                    "key": 1,
                    "value": 0
                },
                {
                    "key": 2,
                    "value": 0
                }
            ]
        },
        "nan_value_counts": {
            "array": [
            ]
        },
        "lower_bounds": {
            "array": [
                {
                    "key": 1,
                    "value": "W\u0004\u0000\u0000\u0000\u0000\u0000\u0000"
                },
                {
                    "key": 2,
                    "value": "gggg"
                }
            ]
        },
        "upper_bounds": {
            "array": [
                {
                    "key": 1,
                    "value": "W\u0004\u0000\u0000\u0000\u0000\u0000\u0000"
                },
                {
                    "key": 2,
                    "value": "gggg"
                }
            ]
        },
        "key_metadata": null,
        "split_offsets": {
            "array": [
                4
            ]
        },
        "equality_ids": null,
        "sort_order_id": {
            "int": 0
        }
    }
}
{
    "status": 1,
    "snapshot_id": {
        "long": 713236168318586475
    },
    "sequence_number": null,
    "file_sequence_number": null,
    "data_file": {
        "content": 0,
        "file_path": "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/data/00001-8-2609dde8-9f25-4593-870f-8c9f2ea6c282-0-00001.parquet",
        "file_format": "PARQUET",
        "partition": {
        },
        "record_count": 1,
        "file_size_in_bytes": 665,
        "column_sizes": {
            "array": [
                {
                    "key": 1,
                    "value": 40
                },
                {
                    "key": 2,
                    "value": 40
                }
            ]
        },
        "value_counts": {
            "array": [
                {
                    "key": 1,
                    "value": 1
                },
                {
                    "key": 2,
                    "value": 1
                }
            ]
        },
        "null_value_counts": {
            "array": [
                {
                    "key": 1,
                    "value": 0
                },
                {
                    "key": 2,
                    "value": 0
                }
            ]
        },
        "nan_value_counts": {
            "array": [
            ]
        },
        "lower_bounds": {
            "array": [
                {
                    "key": 1,
                    "value": "X\u0004\u0000\u0000\u0000\u0000\u0000\u0000"
                },
                {
                    "key": 2,
                    "value": "hhhh"
                }
            ]
        },
        "upper_bounds": {
            "array": [
                {
                    "key": 1,
                    "value": "X\u0004\u0000\u0000\u0000\u0000\u0000\u0000"
                },
                {
                    "key": 2,
                    "value": "hhhh"
                }
            ]
        },
        "key_metadata": null,
        "split_offsets": {
            "array": [
                4
            ]
        },
        "equality_ids": null,
        "sort_order_id": {
            "int": 0
        }
    }
}
通过以下命令读取avro信息:
java -jar avro-tools-1.12.0.jar tojson a6339012-ecd8-4bc3-98e4-3bf81a682e62-m0.avro