This is most likely because the Python environment your BeamRunPythonPipelineOperator task runs in is different from, or incompatible with, your Airflow environment. You can try aligning the Python 3.x environment used by BeamRunPythonPipelineOperator with the one Airflow itself runs in.
The following code example shows how to define the Python environment settings for the pipeline and pass them to the task:
from datetime import datetime

from airflow import DAG
# Airflow 1.10.x import paths; on Airflow 2.x use airflow.operators.python
# and airflow.providers.google.cloud.operators.dataflow instead
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.operators.dataflow_operator import DataFlowPythonOperator

# apache_beam provides the pipeline API used in run_beam_pipeline below
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2021, 1, 1),
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
}
dag = DAG('beam_python_operator_example_dag', default_args=default_args, schedule_interval='@once')
# Define the pipeline options for the Dataflow task; dataflow_default_options
# expects a flat dict of pipeline options. 'requirements_file' tells Dataflow
# to install the pipeline's dependencies so they match the Airflow environment.
beam_pipeline_env = {
    'project': 'my-gcp-project',
    'region': 'my-gcp-region',
    'requirements_file': 'requirements.txt',
}
def run_beam_pipeline(**kwargs):
    """Run a Beam pipeline with options taken from the DAG run configuration."""
    options = kwargs['dag_run'].conf['dataflow_pipeline_options']
    pipeline = beam.Pipeline(options=PipelineOptions(**options))
    pipeline.run().wait_until_finish()
with dag:
    # Submit the pipeline to Dataflow using the Python environment defined above
    beam_pipeline_task = DataFlowPythonOperator(
        task_id='beam_python_task',
        py_file='/home/airflow/dataflow_scripts/my_beam_pipeline.py',
        gcp_conn_id='my_gcp_connection',
        dataflow_default_options=beam_pipeline_env,
    )

    # Run the Beam pipeline in-process on the Airflow worker
    run_beam_pipeline_task = PythonOperator(
        task_id='run_beam_pipeline_task',
        python_callable=run_beam_pipeline,
        provide_context=True,  # needed on Airflow 1.x so **kwargs receives dag_run
    )
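If you are on Airflow 2.x, the apache-beam provider's BeamRunPythonPipelineOperator exposes the Python environment directly. Below is a minimal sketch under that assumption (the file path, connection id, project, and region are placeholders); py_interpreter and py_requirements pin the interpreter and dependencies the pipeline runs with, independently of the Airflow environment:

from airflow.providers.apache.beam.operators.beam import BeamRunPythonPipelineOperator
from airflow.providers.google.cloud.operators.dataflow import DataflowConfiguration

beam_task = BeamRunPythonPipelineOperator(
    task_id='beam_python_task',
    py_file='/home/airflow/dataflow_scripts/my_beam_pipeline.py',  # placeholder path
    runner='DataflowRunner',
    pipeline_options={
        'project': 'my-gcp-project',  # placeholder project
        'region': 'my-gcp-region',    # placeholder region
    },
    py_interpreter='python3',              # interpreter used to run the pipeline
    py_requirements=['apache-beam[gcp]'],  # installed into a temporary virtualenv
    py_system_site_packages=False,         # isolate from Airflow's own packages
    dataflow_config=DataflowConfiguration(gcp_conn_id='my_gcp_connection'),
)

When py_requirements is set, the operator runs the pipeline inside a temporary virtualenv built with those packages, which sidesteps version mismatches between the Beam pipeline and the Airflow installation.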