From 8de1d357ea0f3150da3aa27043ac9b0bf4763320 Mon Sep 17 00:00:00 2001 From: Sky Moore Date: Tue, 14 Oct 2025 14:10:37 +0100 Subject: [PATCH] fix: use cloudscraper to get shortage data --- airflow/dags/ashp/dag.py | 16 ++++++++++++++-- airflow/requirements.txt | 1 + 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/airflow/dags/ashp/dag.py b/airflow/dags/ashp/dag.py index bd40095..444871c 100644 --- a/airflow/dags/ashp/dag.py +++ b/airflow/dags/ashp/dag.py @@ -6,6 +6,7 @@ from time import sleep import requests +import cloudscraper from bs4 import BeautifulSoup import pandas as pd @@ -45,11 +46,22 @@ def extract_load_shortage_list(): logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s') + # Use cloudscraper to bypass Cloudflare protection + scraper = cloudscraper.create_scraper( + browser={ + 'browser': 'chrome', + 'platform': 'windows', + 'mobile': False + } + ) + logging.info('Checking ASHP website for updates') - shortage_list = requests.get(landing_url) + shortage_list = scraper.get(landing_url) if shortage_list.status_code != 200: logging.error('ASHP website unreachable') + logging.error(f'Status code: {shortage_list.status_code}') + logging.error(f'Response: {shortage_list.text}') exit() ashp_drugs = [] @@ -64,7 +76,7 @@ def extract_load_shortage_list(): available_ndcs = [] for shortage in ashp_drugs: - shortage_detail_data = requests.get(base_url + shortage['detail_url']) + shortage_detail_data = scraper.get(base_url + shortage['detail_url']) soup = BeautifulSoup(shortage_detail_data.content, 'html.parser') # Get shortage reasons diff --git a/airflow/requirements.txt b/airflow/requirements.txt index b5e441b..befb6ae 100644 --- a/airflow/requirements.txt +++ b/airflow/requirements.txt @@ -7,3 +7,4 @@ apache-airflow[google] bs4 openpyxl xlrd==1.2.0 +cloudscraper