在使用 AWS Glue 时，可能会遇到作业书签（Job Bookmark）未按预期生效、导致数据被重复处理的问题。为了解决这个问题，你可以尝试以下方法：
方法一：设置并行任务数限制（spark.sql.shuffle.partitions）
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
# Resolve the job arguments first: job.init() below needs the job name.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
# Initialise the SparkContext and GlueContext.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# Job bookmarks only work when the job is initialised before any read and
# committed after the final write. Without job.init() the bookmark state is
# never set up, so every run re-reads the same input and produces duplicates.
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
# Set the number of partitions Spark uses when shuffling data.
spark.conf.set("spark.sql.shuffle.partitions", "100")
# Create the DynamicFrame. transformation_ctx is the key under which Glue
# tracks bookmark state for this source, so it must stay stable across runs.
datasource = glueContext.create_dynamic_frame.from_catalog(database = "your-database", table_name = "your-table-name", transformation_ctx = "datasource")
# Apply transformations here.
# ...
# Write out the result.
glueContext.write_dynamic_frame.from_options(frame = datasource, connection_type = "s3", connection_options = {"path": "s3://your-bucket/your-path/output/"}, format = "parquet", transformation_ctx = "datasink")
# Commit the job so the bookmark state is persisted for the next run.
job.commit()
方法二:手动管理书签
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.utils import getResolvedOptions
from awsglue.job import Job
# sys.argv is used below, so sys must be in scope for this script.
import sys

# Resolve the job arguments first: job.init() below needs the job name.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
# Initialise the SparkContext and GlueContext.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# The job must be initialised before job.commit() is called at the end.
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
# Location of the manually managed bookmark.
bookmark_path = "s3://your-bucket/your-path/bookmark/"
# Load the bookmark written by the previous run.
# NOTE(review): this read presumably fails on the very first run, when
# bookmark_path does not exist yet - confirm and seed the path if needed.
# NOTE(review): `bookmark` is loaded but not applied to `datasource` below;
# filtering out already-processed records depends on the table's key
# column(s) - TODO add that filter in the transformation step.
bookmark = spark.read.option("header", "true").csv(bookmark_path)
# Create the DynamicFrame.
datasource = glueContext.create_dynamic_frame.from_catalog(database = "your-database", table_name = "your-table-name", transformation_ctx = "datasource")
# Apply transformations here.
# ...
# Write out the result first, so that a failed write does not advance the
# bookmark and cause the failed batch to be skipped on the next run.
glueContext.write_dynamic_frame.from_options(frame = datasource, connection_type = "s3", connection_options = {"path": "s3://your-bucket/your-path/output/"}, format = "parquet", transformation_ctx = "datasink")
# Persist the bookmark only after the output is written. The header is
# written so the file round-trips with the header-aware read above;
# otherwise the first data row would be consumed as the header next run.
datasource.toDF().write.mode("overwrite").option("header", "true").csv(bookmark_path)
# Commit the job.
job.commit()
这些方法可以帮助你解决 AWS Glue 作业书签导致数据被重复处理的问题。你可以根据具体情况选择适合的方法来处理书签。