From 8de1d357ea0f3150da3aa27043ac9b0bf4763320 Mon Sep 17 00:00:00 2001
From: Sky Moore <i@msky.me>
Date: Tue, 14 Oct 2025 14:10:37 +0100
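Subject: [PATCH] fix: use cloudscraper to get shortage data

Fetch the ASHP landing page and each shortage detail page through a
cloudscraper session instead of plain requests.get, to get past the
Cloudflare protection in front of the site. When the landing page does
not return 200, also log the status code and response body before
exiting. Add cloudscraper to airflow/requirements.txt.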
---
 airflow/dags/ashp/dag.py | 16 ++++++++++++++--
 airflow/requirements.txt |  1 +
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/airflow/dags/ashp/dag.py b/airflow/dags/ashp/dag.py
index bd40095..444871c 100644
--- a/airflow/dags/ashp/dag.py
+++ b/airflow/dags/ashp/dag.py
@@ -6,6 +6,7 @@
 from time import sleep
 
 import requests
+import cloudscraper
 from bs4 import BeautifulSoup
 import pandas as pd
 
@@ -45,11 +46,22 @@
     def extract_load_shortage_list():
         logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')
 
+        # Use cloudscraper to bypass Cloudflare protection
+        scraper = cloudscraper.create_scraper(
+            browser={
+                'browser': 'chrome',
+                'platform': 'windows',
+                'mobile': False
+            }
+        )
+        
         logging.info('Checking ASHP website for updates')
-        shortage_list = requests.get(landing_url)
+        shortage_list = scraper.get(landing_url)
 
         if shortage_list.status_code != 200:
             logging.error('ASHP website unreachable')
+            logging.error(f'Status code: {shortage_list.status_code}')
+            logging.error(f'Response: {shortage_list.text}')
             exit()
 
         ashp_drugs = []
@@ -64,7 +76,7 @@ def extract_load_shortage_list():
         available_ndcs = []
 
         for shortage in ashp_drugs:
-            shortage_detail_data = requests.get(base_url + shortage['detail_url'])
+            shortage_detail_data = scraper.get(base_url + shortage['detail_url'])
             soup = BeautifulSoup(shortage_detail_data.content, 'html.parser')
 
             # Get shortage reasons
diff --git a/airflow/requirements.txt b/airflow/requirements.txt
index b5e441b..befb6ae 100644
--- a/airflow/requirements.txt
+++ b/airflow/requirements.txt
@@ -7,3 +7,4 @@ apache-airflow[google]
 bs4
 openpyxl
 xlrd==1.2.0
+cloudscraper