From c72e2b334adba98c2fcbe995cb189688f98e5edb Mon Sep 17 00:00:00 2001 From: Anirudh Hegde Date: Wed, 27 Dec 2023 20:27:26 +0530 Subject: [PATCH] Update scrape.py --- scrape.py | 87 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 34 deletions(-) diff --git a/scrape.py b/scrape.py index a82e7be..ebd5ce0 100644 --- a/scrape.py +++ b/scrape.py @@ -2,41 +2,60 @@ from selenium.webdriver.common.by import By import json import time -from scrappers.devfolio import Devfolio +hackathons = [] +driver = None def scrape_data(): - options = webdriver.ChromeOptions() - options.add_argument('--ignore-ssl-errors=yes') - options.add_argument('--ignore-certificate-errors') - driver = webdriver.Remote( - command_executor='http://localhost:4444/wd/hub', - options=options - ) - driver.get("https://mlh.io/seasons/2024/events") - links = driver.find_elements(By.CLASS_NAME, "event-link") # Fetching links of hackathon websites - images = driver.find_elements(By.CSS_SELECTOR, ".image-wrap img") # Fetching images of hackathons - names = driver.find_elements(By.CLASS_NAME, "event-name") # Fetching names of hackathons - dates = driver.find_elements(By.CLASS_NAME, "event-date") # Fetching dates - locs = driver.find_elements(By.CLASS_NAME, "event-location") # Fetching location - modes = driver.find_elements(By.CLASS_NAME,"event-hybrid-notes") # Fetching modes - hackathons = [] - for i in range(len(links)): - hackathon = { - "name": names[i].text, - "link": links[i].get_attribute("href"), - "image": images[i].get_attribute("src"), - "date": dates[i].text, - "location": locs[i].text, - "mode":modes[i].text - } - hackathons.append(hackathon) - with open("hackathons.json","w") as f: - json.dump(hackathons,f,indent=4); - driver.quit() + global hackathons, driver -# scrape_data() -while(1): + # Update this path with the correct path of your system + # chrome_path = "/usr/bin/google-chrome" - # Linux and MacOS + chrome_path="/path/to/google-chrome" + # install 
chromedriver version compatible with your google-chrome version + + # Update this path with the correct path of your system + + #chromedriver_path = "/usr/local/bin/chromedriver" - # Linux and MacOS + chromedriver_path = "/path/to/chromedriver" + + try: + options = webdriver.ChromeOptions() + options.add_argument('--ignore-ssl-errors=yes') + options.add_argument('--ignore-certificate-errors') + + driver = webdriver.Chrome(options=options) + + driver.get("https://mlh.io/seasons/2024/events") + links = driver.find_elements(By.CLASS_NAME, "event-link") # Fetching links of hackathon websites + images = driver.find_elements(By.CSS_SELECTOR, ".image-wrap img") # Fetching images of hackathons + names = driver.find_elements(By.CLASS_NAME, "event-name") # Fetching names of hackathons + dates = driver.find_elements(By.CLASS_NAME, "event-date") # Fetching dates + locs = driver.find_elements(By.CLASS_NAME, "event-location") # Fetching location + modes = driver.find_elements(By.CLASS_NAME,"event-hybrid-notes") # Fetching modes + hackathons = [] + for i in range(len(links)): + hackathon = { + "name": names[i].text, + "link": links[i].get_attribute("href"), + "image": images[i].get_attribute("src"), + "date": dates[i].text, + "location": locs[i].text, + "mode":modes[i].text + } + hackathons.append(hackathon) + except Exception as e: + print(f"Error: {e}") + finally: + try: + if driver: + with open("hackathons.json","w") as f: + json.dump(hackathons,f,indent=4); + driver.quit() + print("Webdriver quit successfully") + exit() + except Exception as e: + print(f"Error in final block: {e}") + +while True: scrape_data() - devfolio=Devfolio() - devfolio.findall() - time.sleep(86400) \ No newline at end of file + time.sleep(86400)