在 AWS Glue Data Catalog 中创建表,可以通过 boto3 的 Glue 客户端调用 create_table 来完成,示例代码如下:
import boto3
import botocore.exceptions
import pandas as pd
import awswrangler as wr
# Low-level boto3 client for the AWS Glue service (default session/credentials).
glue_client = boto3.client(service_name="glue")
# Target database and table names, plus the S3 path used as the table's
# storage location.
path_to_data = 's3://bucket-name/path/to/data'
table_name = 'your-table-name'
database_name = 'your-database-name'

# Column schema as a list of {"Name": ..., "Type": ...} dicts, expanded from
# compact (name, type) pairs.
table_columns = [
    {"Name": column_name, "Type": column_type}
    for column_name, column_type in (
        ("id", "int"),
        ("name", "string"),
        ("age", "int"),
    )
]

# For a partitioned table, a partition schema can be defined like this:
# partition_keys = [{"Name": "date", "Type": "string"}]
try:
response = glue_client.create_table(
DatabaseName=database_name,
TableInput={
"Name": table_name,
"StorageDescriptor": {
"Columns": table_columns,
"Location": path_to_data,
"InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
"OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"SerdeInfo": {
"SerializationLibrary": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
"Parameters": {
"serialization.format": ","
}
},
"BucketColumns": [],
"SortColumns": [],
"Parameters": {}
}
}
)
print("Table is successfully created")
except botocore.exceptions.ClientError as e:
if e.response['Error']['Code'] == 'AlreadyExistsException':
print(f"Table {table_name} already exists")
else