要解决 AWS Glue 爬虫无法识别历史文件中一致的 CSV 模式的问题,可以按以下步骤操作(示例代码假定已导入 boto3、time、GlueContext 和 SparkContext):
# Catalog coordinates shared by every step of this script.
database = "your_database_name"
table = "your_table_name"

# Glue wrapper around the active (or newly created) Spark context.
glue_context = GlueContext(SparkContext.getOrCreate())

# Read the catalog table as a DynamicFrame. The result is not kept, so this
# only exercises the lookup.
# NOTE(review): presumably this requires the table to already exist in the
# catalog -- confirm, since the crawler further down has not run yet.
glue_context.create_dynamic_frame.from_catalog(
    database=database,
    table_name=table,
)
# Crawler identity and the S3 prefix it should scan.
crawler_name = "your_crawler_name"
s3_path = "s3://your_bucket/your_folder/"

# Register the crawler with Glue: one S3 target, results written to `database`.
glue_client = boto3.client('glue')
s3_targets = [{'Path': s3_path}]
response = glue_client.create_crawler(
    Name=crawler_name,
    Role='your_crawler_role',
    DatabaseName=database,
    Targets={'S3Targets': s3_targets},
)
# Local import so this step runs even when `time` was not imported earlier.
import time

# Kick off the crawl.
glue_client.start_crawler(Name=crawler_name)

# Block until the crawl has fully finished. A crawler reports RUNNING, then
# STOPPING, then READY; polling only while the state is RUNNING would fall
# through during STOPPING, before the run is complete. Wait for READY instead.
crawler_state = glue_client.get_crawler(Name=crawler_name)
while crawler_state['Crawler']['State'] != 'READY':
    time.sleep(10)
    crawler_state = glue_client.get_crawler(Name=crawler_name)
# Fetch the table definition as it currently stands in the catalog.
response = glue_client.get_table(DatabaseName=database, Name=table)

# get_table returns read-only metadata (DatabaseName, CreateTime, UpdateTime,
# CreatedBy, ...) alongside the definition. TableInput rejects those keys, so
# keep only the fields that may be written back.
table_input_keys = frozenset((
    'Name',
    'Description',
    'Owner',
    'LastAccessTime',
    'LastAnalyzedTime',
    'Retention',
    'StorageDescriptor',
    'PartitionKeys',
    'ViewOriginalText',
    'ViewExpandedText',
    'TableType',
    'Parameters',
    'TargetTable',
))
table_input = {
    key: value
    for key, value in response['Table'].items()
    if key in table_input_keys
}
table_input['TableType'] = 'EXTERNAL_TABLE'

# The table already exists (get_table just returned it), so create_table
# would fail with AlreadyExistsException; update the existing table in place.
response = glue_client.update_table(
    DatabaseName=database,
    TableInput=table_input,
)
这样,Glue 爬虫就会识别历史文件中一致的 CSV 模式,并更新表的模式。