iceberg表文件结构


发布于 2025-06-03 / 4 阅读 / 0 评论
一张iceberg表的文件结构

iceberg表文件结构

一张 Iceberg 表在 HDFS 上的文件组织结构如下所示,表目录下包含数据目录(data)和元数据目录(metadata):

[root@abc-10-10-10-11 ~]# hadoop fs -ls /data/hive/warehouse/iceberg_test_db.db/spark_hive_partition
Found 2 items
drwxr-xr-x   - hadoop supergroup          0 2025-06-02 12:03 /data/hive/warehouse/iceberg_test_db.db/spark_hive_partition/data
drwxr-xr-x   - hadoop supergroup          0 2025-06-02 12:03 /data/hive/warehouse/iceberg_test_db.db/spark_hive_partition/metadata

数据目录(data)

数据目录结构如下所示:

[root@abc-10-10-10-11 ~]# hadoop fs -ls /data/hive/warehouse/iceberg_test_db.db/spark_hive_partition/data
Found 1 items
drwxr-xr-x   - hadoop supergroup          0 2025-06-02 12:03 /data/hive/warehouse/iceberg_test_db.db/spark_hive_partition/data/dt=20220501
[root@abc-10-10-10-11 ~]# hadoop fs -ls /data/hive/warehouse/iceberg_test_db.db/spark_hive_partition/data/dt=20220501
Found 1 items
-rw-r--r--   3 hadoop supergroup        670 2025-06-02 12:03 /data/hive/warehouse/iceberg_test_db.db/spark_hive_partition/data/dt=20220501/00000-1-131a29fa-741a-4c85-91c4-1fd23a23578a-0-00001.parquet

元数据目录(metadata)

元数据目录如下所示:

[root@abc-10-10-10-11 ~]# hadoop fs -ls /data/hive/warehouse/iceberg_test_db.db/spark_hive_partition/metadata
Found 4 items
-rw-r--r--   3 hadoop supergroup       1176 2025-06-02 12:03 /data/hive/warehouse/iceberg_test_db.db/spark_hive_partition/metadata/00000-40c5199d-b380-4675-afdb-74e028ea6559.metadata.json
-rw-r--r--   3 hadoop supergroup       2588 2025-06-02 12:03 /data/hive/warehouse/iceberg_test_db.db/spark_hive_partition/metadata/00001-fd678b94-eaf3-4993-bcb6-5c90fac4fdfc.metadata.json
-rw-r--r--   3 hadoop supergroup       6914 2025-06-02 12:03 /data/hive/warehouse/iceberg_test_db.db/spark_hive_partition/metadata/eb1817e4-01f3-401e-a0d5-2a62e907dbda-m0.avro
-rw-r--r--   3 hadoop supergroup       4498 2025-06-02 12:03 /data/hive/warehouse/iceberg_test_db.db/spark_hive_partition/metadata/snap-4502910375816569575-1-eb1817e4-01f3-401e-a0d5-2a62e907dbda.avro

元数据版本(00000-{UUID}.metadata.json)

[root@abc-10-10-10-11 ~]# hadoop fs -cat /data/hive/warehouse/iceberg_test_db.db/spark_hive_partition/metadata/00000-40c5199d-b380-4675-afdb-74e028ea6559.metadata.json
{
  "format-version" : 2,
  "table-uuid" : "68f5b114-bcac-4ea2-94b4-b0ef0697217c",
  "location" : "hdfs://ABC0000001/data/hive/warehouse/iceberg_test_db.db/spark_hive_partition",
  "last-sequence-number" : 0,
  "last-updated-ms" : 1750651393777,
  "last-column-id" : 2,
  "current-schema-id" : 0,
  "schemas" : [ {
    "type" : "struct",
    "schema-id" : 0,
    "fields" : [ {
      "id" : 1,
      "name" : "id",
      "required" : false,
      "type" : "long"
    }, {
      "id" : 2,
      "name" : "dt",
      "required" : false,
      "type" : "string"
    } ]
  } ],
  "default-spec-id" : 0,
  "partition-specs" : [ {
    "spec-id" : 0,
    "fields" : [ {
      "name" : "dt",
      "transform" : "identity",
      "source-id" : 2,
      "field-id" : 1000
    } ]
  } ],
  "last-partition-id" : 1000,
  "default-sort-order-id" : 0,
  "sort-orders" : [ {
    "order-id" : 0,
    "fields" : [ ]
  } ],
  "properties" : {
    "owner" : "hadoop",
    "write.parquet.compression-codec" : "zstd"
  },
  "current-snapshot-id" : -1,
  "refs" : { },
  "snapshots" : [ ],
  "statistics" : [ ],
  "partition-statistics" : [ ],
  "snapshot-log" : [ ],
  "metadata-log" : [ ]
}

元数据版本(00001-{UUID}.metadata.json)

[root@abc-10-10-10-11 ~]# hadoop fs -cat /data/hive/warehouse/iceberg_test_db.db/spark_hive_partition/metadata/00001-fd678b94-eaf3-4993-bcb6-5c90fac4fdfc.metadata.json
{
  "format-version" : 2,
  "table-uuid" : "68f5b114-bcac-4ea2-94b4-b0ef0697217c",
  "location" : "hdfs://ABC0000001/data/hive/warehouse/iceberg_test_db.db/spark_hive_partition",
  "last-sequence-number" : 1,
  "last-updated-ms" : 1750651398083,
  "last-column-id" : 2,
  "current-schema-id" : 0,
  "schemas" : [ {
    "type" : "struct",
    "schema-id" : 0,
    "fields" : [ {
      "id" : 1,
      "name" : "id",
      "required" : false,
      "type" : "long"
    }, {
      "id" : 2,
      "name" : "dt",
      "required" : false,
      "type" : "string"
    } ]
  } ],
  "default-spec-id" : 0,
  "partition-specs" : [ {
    "spec-id" : 0,
    "fields" : [ {
      "name" : "dt",
      "transform" : "identity",
      "source-id" : 2,
      "field-id" : 1000
    } ]
  } ],
  "last-partition-id" : 1000,
  "default-sort-order-id" : 0,
  "sort-orders" : [ {
    "order-id" : 0,
    "fields" : [ ]
  } ],
  "properties" : {
    "owner" : "hadoop",
    "write.parquet.compression-codec" : "zstd"
  },
  "current-snapshot-id" : 4502910375816569575,
  "refs" : {
    "main" : {
      "snapshot-id" : 4502910375816569575,
      "type" : "branch"
    }
  },
  "snapshots" : [ {
    "sequence-number" : 1,
    "snapshot-id" : 4502910375816569575,
    "timestamp-ms" : 1750651398083,
    "summary" : {
      "operation" : "append",
      "spark.app.id" : "application_1750612138448_0357",
      "added-data-files" : "1",
      "added-records" : "1",
      "added-files-size" : "670",
      "changed-partition-count" : "1",
      "total-records" : "1",
      "total-files-size" : "670",
      "total-data-files" : "1",
      "total-delete-files" : "0",
      "total-position-deletes" : "0",
      "total-equality-deletes" : "0",
      "engine-version" : "3.5.2",
      "app-id" : "application_1750612138448_0357",
      "engine-name" : "spark",
      "iceberg-version" : "Apache Iceberg 1.6.1"
    },
    "manifest-list" : "hdfs://ABC0000001/data/hive/warehouse/iceberg_test_db.db/spark_hive_partition/metadata/snap-4502910375816569575-1-eb1817e4-01f3-401e-a0d5-2a62e907dbda.avro",
    "schema-id" : 0
  } ],
  "statistics" : [ ],
  "partition-statistics" : [ ],
  "snapshot-log" : [ {
    "timestamp-ms" : 1750651398083,
    "snapshot-id" : 4502910375816569575
  } ],
  "metadata-log" : [ {
    "timestamp-ms" : 1750651393777,
    "metadata-file" : "hdfs://ABC0000001/data/hive/warehouse/iceberg_test_db.db/spark_hive_partition/metadata/00000-40c5199d-b380-4675-afdb-74e028ea6559.metadata.json"
  } ]
}