Move first provider (airbyte) to a separate project
This is the first step in moving providers to separate
projects inside our monorepo.

Airbyte is the first provider separated out into a project
under the "providers/<PROVIDER_ID>" directory. This has the
nice property that all files belonging to the same provider
live under a single directory that is part of the Airflow
workspace. For now the tooling is more complex because it has
to handle providers in both the "old" and the "new" structure,
but once all old providers are moved to the new structure, a
lot of that code can be removed and simplified.

The new structure for provider code is:

```
providers
        |- PROVIDER_ID
        |            |- src
        |            |    |-airflow
        |            |            |- providers
        |            |                       |- PROVIDER_ID
        |            |- tests
        |            |      |- providers
        |            |                 |- PROVIDER_ID
        |            |- docs
        |            |     |- .latest-doc-only-changes.txt
        |            |- pyproject.toml
        |            |- CHANGELOG.rst
        |            |- provider.yaml
        |            |- README.rst
        |- PROVIDER_ID2
        ...

```
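Each provider project in the new layout ships its own pyproject.toml, so tooling can discover provider projects simply by scanning for those files. As a rough sketch (not part of the commit; the "devel" extra is illustrative), the editable-install flags used further down in the Dockerfile change can be assembled like this:

```bash
#!/usr/bin/env bash
# Sketch: enumerate provider projects in the new layout and build the
# `--editable` flags the same way the updated install_airflow() does.
set -euo pipefail

installation_command_flags="--editable .[devel] --editable ./task_sdk"
# Every directory under providers/ containing a pyproject.toml is a project.
while IFS= read -r -d '' pyproject_toml_file; do
    project_folder=$(dirname "${pyproject_toml_file}")
    installation_command_flags="${installation_command_flags} --editable ${project_folder}"
done < <(find "providers" -name "pyproject.toml" -print0)

echo "pip install ${installation_command_flags}"
```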
potiuk committed Dec 28, 2024
1 parent 3ad1cf5 commit 1cf19b8
Showing 77 changed files with 1,538 additions and 738 deletions.
7 changes: 3 additions & 4 deletions .github/boring-cyborg.yml
```diff
@@ -19,10 +19,9 @@

 labelPRBasedOnFilePath:
   provider:airbyte:
-    - providers/src/airflow/providers/airbyte/**/*
-    - docs/apache-airflow-providers-airbyte/**/*
-    - providers/tests/airbyte/**/*
-    - providers/tests/system/airbyte/**/*
+    - providers/airbyte/src/airflow/providers/airbyte/**/*
+    - providers/airbyte/docs/**/*
+    - providers/airbyte/tests/**/*

   provider:alibaba:
     - providers/src/airflow/providers/alibaba/**/*
```
2 changes: 1 addition & 1 deletion .github/workflows/ci-image-build.yml
```diff
@@ -158,7 +158,7 @@ ${{ inputs.do-build == 'true' && inputs.image-tag || '' }}"
       - name: "Regenerate dependencies in case they were modified manually so that we can build an image"
         shell: bash
         run: |
-          pip install rich>=12.4.4 pyyaml
+          pip install rich>=12.4.4 pyyaml tomli
          python scripts/ci/pre_commit/update_providers_dependencies.py
        if: inputs.do-build == 'true' && inputs.upgrade-to-newer-dependencies != 'false'
      - name: "Start ARM instance"
```
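The extra tomli install is needed because the dependency-update script now has to read per-provider pyproject.toml files, and the stdlib tomllib parser only exists on Python 3.11+. A minimal sketch of that fallback (the provider path is illustrative):

```bash
python - <<'PY'
try:
    import tomllib  # stdlib TOML parser, Python 3.11+
except ModuleNotFoundError:
    import tomli as tomllib  # backport installed via `pip install tomli`

# Hypothetical example path in the new provider layout:
with open("providers/airbyte/pyproject.toml", "rb") as f:
    project = tomllib.load(f)
print(project["project"]["name"])
PY
```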
2 changes: 1 addition & 1 deletion .github/workflows/prod-image-build.yml
```diff
@@ -252,7 +252,7 @@ ${{ inputs.do-build == 'true' && inputs.image-tag || '' }}"
       - name: "Regenerate dependencies in case they were modified manually so that we can build an image"
         shell: bash
         run: |
-          pip install rich>=12.4.4 pyyaml
+          pip install rich>=12.4.4 pyyaml tomli
          python scripts/ci/pre_commit/update_providers_dependencies.py
        if: inputs.do-build == 'true' && inputs.upgrade-to-newer-dependencies != 'false'
      - name: "Cleanup dist and context file"
```
12 changes: 8 additions & 4 deletions .pre-commit-config.yaml
```diff
@@ -364,7 +364,12 @@ repos:
         entry: ./scripts/ci/pre_commit/update_providers_init.py
         language: python
         pass_filenames: true
-        files: ^providers/[^\/]*/__init__.py$|^providers/[^\/]*/[^\/]*/__init__.py$|^providers/.*/provider.yaml$|^airflow_breeze/templates/PROVIDER__INIT__PY_TEMPLATE.py.jinja2^
+        files: |
+          (?x)
+          ^providers/[^\/]*/__init__.py$|
+          ^providers/[^\/]*/[^\/]*/__init__.py$|
+          ^providers/.*/provider.yaml$|
+          ^airflow_breeze/templates/PROVIDER__INIT__PY_TEMPLATE.py.jinja2$
         additional_dependencies: ['rich>=12.4.4','requests']
         require_serial: true
       - id: ruff
@@ -713,8 +718,7 @@ repos:
           ^airflow/decorators/.*$|
           ^airflow/hooks/.*$|
           ^airflow/operators/.*$|
-          ^providers/src/airflow/providers/.*$|
-          ^providers/src/airflow/providers/standard/sensors/.*$|
+          ^providers/.*$|
           ^dev/provider_packages/.*$
       - id: check-base-operator-usage
         language: pygrep
@@ -1399,7 +1403,7 @@ repos:
         name: Validate provider.yaml files
         entry: ./scripts/ci/pre_commit/check_provider_yaml_files.py
         language: python
-        files: ^providers/src/airflow/providers/.*/provider\.yaml$
+        files: ^providers/src/airflow/providers/.*/provider\.yaml$|^providers/.*/src/provider\.yaml$
         additional_dependencies: ['rich>=12.4.4']
         require_serial: true
       - id: check-template-fields-valid
```
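The multi-line files: value works because the pattern starts with Python's verbose-regex flag (?x), which makes the regex engine ignore the whitespace and newlines the YAML block scalar introduces. A quick, hypothetical way to check which paths the new pattern matches (not part of the commit):

```bash
python - <<'PY'
import re

# Same pattern as in the pre-commit hook above, as a verbose regex.
pattern = re.compile(r"""(?x)
    ^providers/[^\/]*/__init__.py$|
    ^providers/[^\/]*/[^\/]*/__init__.py$|
    ^providers/.*/provider.yaml$|
    ^airflow_breeze/templates/PROVIDER__INIT__PY_TEMPLATE.py.jinja2$
""")
for path in (
    "providers/airbyte/__init__.py",    # matches
    "providers/airbyte/provider.yaml",  # matches
    "providers/airbyte/src/airflow/providers/airbyte/hooks/airbyte.py",  # does not
):
    print(path, "->", bool(pattern.match(path)))
PY
```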
131 changes: 19 additions & 112 deletions Dockerfile
```diff
@@ -422,85 +422,6 @@ common::show_packaging_tool_version_and_location
 common::install_packaging_tools
 EOF
 
-# The content below is automatically copied from scripts/docker/install_airflow_dependencies_from_branch_tip.sh
-COPY <<"EOF" /install_airflow_dependencies_from_branch_tip.sh
-#!/usr/bin/env bash
-
-. "$( dirname "${BASH_SOURCE[0]}" )/common.sh"
-
-: "${AIRFLOW_REPO:?Should be set}"
-: "${AIRFLOW_BRANCH:?Should be set}"
-: "${INSTALL_MYSQL_CLIENT:?Should be true or false}"
-: "${INSTALL_POSTGRES_CLIENT:?Should be true or false}"
-
-function install_airflow_dependencies_from_branch_tip() {
-    echo
-    echo "${COLOR_BLUE}Installing airflow from ${AIRFLOW_BRANCH}. It is used to cache dependencies${COLOR_RESET}"
-    echo
-    if [[ ${INSTALL_MYSQL_CLIENT} != "true" ]]; then
-        AIRFLOW_EXTRAS=${AIRFLOW_EXTRAS/mysql,}
-    fi
-    if [[ ${INSTALL_POSTGRES_CLIENT} != "true" ]]; then
-        AIRFLOW_EXTRAS=${AIRFLOW_EXTRAS/postgres,}
-    fi
-    local TEMP_AIRFLOW_DIR
-    TEMP_AIRFLOW_DIR=$(mktemp -d)
-    # Install latest set of dependencies - without constraints. This is to download a "base" set of
-    # dependencies that we can cache and reuse when installing airflow using constraints and latest
-    # pyproject.toml in the next step (when we install regular airflow).
-    set -x
-    curl -fsSL "https://github.com/${AIRFLOW_REPO}/archive/${AIRFLOW_BRANCH}.tar.gz" | \
-        tar xz -C "${TEMP_AIRFLOW_DIR}" --strip 1
-    # Make sure editable dependencies are calculated when devel-ci dependencies are installed
-    ${PACKAGING_TOOL_CMD} install ${EXTRA_INSTALL_FLAGS} ${ADDITIONAL_PIP_INSTALL_FLAGS} \
-        --editable "${TEMP_AIRFLOW_DIR}[${AIRFLOW_EXTRAS}]"
-    set +x
-    common::install_packaging_tools
-    set -x
-    echo "${COLOR_BLUE}Uninstalling providers. Dependencies remain${COLOR_RESET}"
-    # Uninstall airflow and providers to keep only the dependencies. In the future when
-    # planned https://github.com/pypa/pip/issues/11440 is implemented in pip we might be able to use this
-    # flag and skip the remove step.
-    pip freeze | grep apache-airflow-providers | xargs ${PACKAGING_TOOL_CMD} uninstall ${EXTRA_UNINSTALL_FLAGS} || true
-    set +x
-    echo
-    echo "${COLOR_BLUE}Uninstalling just airflow. Dependencies remain. Now target airflow can be reinstalled using mostly cached dependencies${COLOR_RESET}"
-    echo
-    set +x
-    ${PACKAGING_TOOL_CMD} uninstall ${EXTRA_UNINSTALL_FLAGS} apache-airflow
-    rm -rf "${TEMP_AIRFLOW_DIR}"
-    set -x
-    # If you want to make sure dependency is removed from cache in your PR when you removed it from
-    # pyproject.toml - please add your dependency here as a list of strings
-    # for example:
-    # DEPENDENCIES_TO_REMOVE=("package_a" "package_b")
-    # Once your PR is merged, you should make a follow-up PR to remove it from this list
-    # and increase the AIRFLOW_CI_BUILD_EPOCH in Dockerfile.ci to make sure your cache is rebuilt.
-    local DEPENDENCIES_TO_REMOVE
-    # IMPORTANT!! Make sure to increase AIRFLOW_CI_BUILD_EPOCH in Dockerfile.ci when you remove a dependency from that list
-    DEPENDENCIES_TO_REMOVE=()
-    if [[ "${DEPENDENCIES_TO_REMOVE[*]}" != "" ]]; then
-        echo
-        echo "${COLOR_BLUE}Uninstalling just removed dependencies (temporary until cache refreshes)${COLOR_RESET}"
-        echo "${COLOR_BLUE}Dependencies to uninstall: ${DEPENDENCIES_TO_REMOVE[*]}${COLOR_RESET}"
-        echo
-        set +x
-        ${PACKAGING_TOOL_CMD} uninstall "${DEPENDENCIES_TO_REMOVE[@]}" || true
-        set -x
-        # make sure that the dependency is not needed by something else
-        pip check
-    fi
-}
-
-common::get_colors
-common::get_packaging_tool
-common::get_airflow_version_specification
-common::get_constraints_location
-common::show_packaging_tool_version_and_location
-
-install_airflow_dependencies_from_branch_tip
-EOF
-
 # The content below is automatically copied from scripts/docker/common.sh
 COPY <<"EOF" /common.sh
 #!/usr/bin/env bash
```
```diff
@@ -524,17 +445,15 @@ function common::get_packaging_tool() {
 
     ## IMPORTANT: IF YOU MODIFY THIS FUNCTION YOU SHOULD ALSO MODIFY CORRESPONDING FUNCTION IN
     ## `scripts/in_container/_in_container_utils.sh`
-    local PYTHON_BIN
-    PYTHON_BIN=$(which python)
     if [[ ${AIRFLOW_USE_UV} == "true" ]]; then
         echo
         echo "${COLOR_BLUE}Using 'uv' to install Airflow${COLOR_RESET}"
         echo
         export PACKAGING_TOOL="uv"
         export PACKAGING_TOOL_CMD="uv pip"
         if [[ -z ${VIRTUAL_ENV=} ]]; then
-            export EXTRA_INSTALL_FLAGS="--python ${PYTHON_BIN}"
-            export EXTRA_UNINSTALL_FLAGS="--python ${PYTHON_BIN}"
+            export EXTRA_INSTALL_FLAGS="--system"
+            export EXTRA_UNINSTALL_FLAGS="--system"
         else
             export EXTRA_INSTALL_FLAGS=""
             export EXTRA_UNINSTALL_FLAGS=""
```
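When no virtualenv is active, the scripts now pass --system to uv pip instead of computing the interpreter path and handing it over via --python. Both end up targeting the interpreter that python resolves to; --system just skips the extra lookup. A minimal before/after sketch (outside a virtualenv):

```bash
# Before: point uv at the interpreter discovered via `which python`
uv pip install --python "$(which python)" rich

# After: target the system interpreter directly
uv pip install --system rich
```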
```diff
@@ -900,18 +819,14 @@ function install_airflow() {
     # Determine the installation_command_flags based on AIRFLOW_INSTALLATION_METHOD method
     local installation_command_flags
     if [[ ${AIRFLOW_INSTALLATION_METHOD} == "." ]]; then
-        # We need _a_ file in there otherwise the editable install doesn't include anything in the .pth file
-        mkdir -p ./providers/src/airflow/providers/
-        touch ./providers/src/airflow/providers/__init__.py
-
-        # Similarly we need _a_ file for task_sdk too
-        mkdir -p ./task_sdk/src/airflow/sdk/
-        echo '__version__ = "0.0.0dev0"' > ./task_sdk/src/airflow/sdk/__init__.py
-
-        trap 'rm -f ./providers/src/airflow/providers/__init__.py ./task_sdk/src/airflow/__init__.py 2>/dev/null' EXIT
-
         # When installing from sources - we always use `--editable` mode
-        installation_command_flags="--editable .[${AIRFLOW_EXTRAS}]${AIRFLOW_VERSION_SPECIFICATION} --editable ./providers --editable ./task_sdk"
+        # TODO(potiuk) when we move all providers to new structure, we will be able to remove all that and
+        # Use `uv sync` rather than `uv pip install` rather than finding all pyproject toml / projects here
+        installation_command_flags="--editable .[${AIRFLOW_EXTRAS}]${AIRFLOW_VERSION_SPECIFICATION} --editable ./task_sdk"
+        while IFS= read -r -d '' pyproject_toml_file; do
+            project_folder=$(dirname ${pyproject_toml_file})
+            installation_command_flags="${installation_command_flags} --editable ${project_folder}"
+        done < <(find "providers" -name "pyproject.toml" -print0)
     elif [[ ${AIRFLOW_INSTALLATION_METHOD} == "apache-airflow" ]]; then
         installation_command_flags="apache-airflow[${AIRFLOW_EXTRAS}]${AIRFLOW_VERSION_SPECIFICATION}"
     elif [[ ${AIRFLOW_INSTALLATION_METHOD} == apache-airflow\ @\ * ]]; then
```
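With this loop, every providers/*/pyproject.toml found becomes another --editable target. At this point in the migration only airbyte has been moved, so the loop should pick up the umbrella providers/pyproject.toml (still covering the old-structure providers) plus providers/airbyte/pyproject.toml. The flags would expand roughly like this (a hypothetical expansion, assuming the devel extra):

```bash
pip install --editable ".[devel]" \
    --editable ./task_sdk \
    --editable providers \
    --editable providers/airbyte
```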
```diff
@@ -1407,7 +1322,8 @@ ARG PYTHON_BASE_IMAGE
 ENV PYTHON_BASE_IMAGE=${PYTHON_BASE_IMAGE} \
     DEBIAN_FRONTEND=noninteractive LANGUAGE=C.UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
     LC_CTYPE=C.UTF-8 LC_MESSAGES=C.UTF-8 \
-    PIP_CACHE_DIR=/tmp/.cache/pip
+    PIP_CACHE_DIR=/tmp/.cache/pip \
+    UV_CACHE_DIR=/tmp/.cache/uv
 
 ARG DEV_APT_DEPS=""
 ARG ADDITIONAL_DEV_APT_DEPS=""
```
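Routing both pip's and uv's caches under /tmp/.cache/ lets a single BuildKit cache mount (see the RUN changes below) serve both tools. A quick hypothetical check that the tools honour these variables:

```bash
export PIP_CACHE_DIR=/tmp/.cache/pip
export UV_CACHE_DIR=/tmp/.cache/uv
pip cache dir   # -> /tmp/.cache/pip
uv cache dir    # -> /tmp/.cache/uv
```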
```diff
@@ -1473,9 +1389,6 @@ ARG DEFAULT_CONSTRAINTS_BRANCH="constraints-main"
 
 # By default PIP has progress bar but you can disable it.
 ARG PIP_PROGRESS_BAR
-# By default we do not use pre-cached packages, but in CI/Breeze environment we override this to speed up
-# builds in case pyproject.toml changed. This is pure optimisation of CI/Breeze builds.
-ARG AIRFLOW_PRE_CACHED_PIP_PACKAGES="false"
 # This is airflow version that is put in the label of the image build
 ARG AIRFLOW_VERSION
 # By default latest released version of airflow is installed (when empty) but this value can be overridden
@@ -1513,7 +1426,6 @@ ENV AIRFLOW_PIP_VERSION=${AIRFLOW_PIP_VERSION} \
     AIRFLOW_UV_VERSION=${AIRFLOW_UV_VERSION} \
     UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT} \
     AIRFLOW_USE_UV=${AIRFLOW_USE_UV} \
-    AIRFLOW_PRE_CACHED_PIP_PACKAGES=${AIRFLOW_PRE_CACHED_PIP_PACKAGES} \
     AIRFLOW_VERSION=${AIRFLOW_VERSION} \
     AIRFLOW_INSTALLATION_METHOD=${AIRFLOW_INSTALLATION_METHOD} \
     AIRFLOW_VERSION_SPECIFICATION=${AIRFLOW_VERSION_SPECIFICATION} \
@@ -1538,8 +1450,7 @@ ENV AIRFLOW_PIP_VERSION=${AIRFLOW_PIP_VERSION} \
 
 # Copy all scripts required for installation - changing any of those should lead to
 # rebuilding from here
-COPY --from=scripts common.sh install_packaging_tools.sh \
-    install_airflow_dependencies_from_branch_tip.sh create_prod_venv.sh /scripts/docker/
+COPY --from=scripts common.sh install_packaging_tools.sh create_prod_venv.sh /scripts/docker/
 
 # We can set this value to true in case we want to install .whl/.tar.gz packages placed in the
 # docker-context-files folder. This can be done for both additional packages you want to install
```
```diff
@@ -1569,13 +1480,7 @@ ENV AIRFLOW_CI_BUILD_EPOCH=${AIRFLOW_CI_BUILD_EPOCH}
 # By default PIP installs everything to ~/.local and it's also treated as VIRTUALENV
 ENV VIRTUAL_ENV="${AIRFLOW_USER_HOME_DIR}/.local"
 
-RUN bash /scripts/docker/install_packaging_tools.sh; \
-    bash /scripts/docker/create_prod_venv.sh; \
-    if [[ ${AIRFLOW_PRE_CACHED_PIP_PACKAGES} == "true" && \
-        ${INSTALL_PACKAGES_FROM_CONTEXT} == "false" && \
-        ${UPGRADE_INVALIDATION_STRING} == "" ]]; then \
-        bash /scripts/docker/install_airflow_dependencies_from_branch_tip.sh; \
-    fi
+RUN bash /scripts/docker/install_packaging_tools.sh; bash /scripts/docker/create_prod_venv.sh
 
 COPY --chown=airflow:0 ${AIRFLOW_SOURCES_FROM} ${AIRFLOW_SOURCES_TO}
 
@@ -1599,10 +1504,10 @@ COPY --from=scripts install_from_docker_context_files.sh install_airflow.sh \
 # an incorrect architecture.
 ARG TARGETARCH
 # Value to be able to easily change cache id and therefore use a bare new cache
-ARG PIP_CACHE_EPOCH="9"
+ARG DEPENDENCY_CACHE_EPOCH="9"
 
 # hadolint ignore=SC2086, SC2010, DL3042
-RUN --mount=type=cache,id=$PYTHON_BASE_IMAGE-$AIRFLOW_PIP_VERSION-$TARGETARCH-$PIP_CACHE_EPOCH,target=/tmp/.cache/pip,uid=${AIRFLOW_UID} \
+RUN --mount=type=cache,id=prod-$TARGETARCH-$DEPENDENCY_CACHE_EPOCH,target=/tmp/.cache/,uid=${AIRFLOW_UID} \
     if [[ ${INSTALL_PACKAGES_FROM_CONTEXT} == "true" ]]; then \
         bash /scripts/docker/install_from_docker_context_files.sh; \
     fi; \
```
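Both dependency-installing RUN steps now share one cache mount, keyed by prod-$TARGETARCH-$DEPENDENCY_CACHE_EPOCH and covering all of /tmp/.cache/, so pip and uv reuse downloads across builds. Bumping the epoch discards the cache; a hypothetical rebuild with a fresh dependency cache:

```bash
docker buildx build --build-arg DEPENDENCY_CACHE_EPOCH=10 -t my-airflow-prod-image .
```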
```diff
@@ -1622,7 +1527,7 @@ RUN --mount=type=cache,id=$PYTHON_BASE_IMAGE-$AIRFLOW_PIP_VERSION-$TARGETARCH-$P
 # during the build additionally to whatever has been installed so far. It is recommended that
 # the requirements.txt contains only dependencies with == version specification
 # hadolint ignore=DL3042
-RUN --mount=type=cache,id=additional-requirements-$PYTHON_BASE_IMAGE-$AIRFLOW_PIP_VERSION-$TARGETARCH-$PIP_CACHE_EPOCH,target=/tmp/.cache/pip,uid=${AIRFLOW_UID} \
+RUN --mount=type=cache,id=prod-$TARGETARCH-$DEPENDENCY_CACHE_EPOCH,target=/tmp/.cache/,uid=${AIRFLOW_UID} \
     if [[ -f /docker-context-files/requirements.txt ]]; then \
         pip install -r /docker-context-files/requirements.txt; \
     fi
@@ -1650,7 +1555,9 @@ ARG PYTHON_BASE_IMAGE
 ENV PYTHON_BASE_IMAGE=${PYTHON_BASE_IMAGE} \
     # Make sure noninteractive debian install is used and language variables set
     DEBIAN_FRONTEND=noninteractive LANGUAGE=C.UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
-    LC_CTYPE=C.UTF-8 LC_MESSAGES=C.UTF-8 LD_LIBRARY_PATH=/usr/local/lib
+    LC_CTYPE=C.UTF-8 LC_MESSAGES=C.UTF-8 LD_LIBRARY_PATH=/usr/local/lib \
+    PIP_CACHE_DIR=/tmp/.cache/pip \
+    UV_CACHE_DIR=/tmp/.cache/uv
 
 ARG RUNTIME_APT_DEPS=""
 ARG ADDITIONAL_RUNTIME_APT_DEPS=""
```