From fcac8d4155a4d23a8807d09efb29b14d900611fd Mon Sep 17 00:00:00 2001
From: Riccardo Boero <ribo@nilu.no>
Date: Thu, 11 Jan 2024 09:05:18 +0100
Subject: [PATCH] Add download checks to avoid re-downloading existing files.

---
 inputs/download_transform.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/inputs/download_transform.py b/inputs/download_transform.py
index 5313637..49a5d0b 100644
--- a/inputs/download_transform.py
+++ b/inputs/download_transform.py
@@ -1,3 +1,4 @@
+import os
 import pandas as pd
 import geopandas as gpd
 from shapely.geometry import shape
@@ -7,15 +8,19 @@ from concurrent.futures import ThreadPoolExecutor
 interested_locations = ['Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'CzechRepublic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Iceland', 'Switzerland', 'Liechtenstein', 'Norway', 'UnitedKingdom', 'Ukraine', 'UnitedStates']
 
 def process_file(row):
-    if row['Location'] in interested_locations:
+    file_path = f"data/{row['QuadKey']}.geojson"
+    if row['Location'] in interested_locations and not os.path.exists(file_path):
         df = pd.read_json(row['Url'], lines=True)
         df['geometry'] = df['geometry'].apply(shape)
         gdf = gpd.GeoDataFrame(df, crs=4326)
-        gdf.to_file(f"data/{row['QuadKey']}.geojson", driver="GeoJSON")
+        gdf.to_file(file_path, driver="GeoJSON")
 
 def main():
     dataset_links = pd.read_csv("https://minedbuildings.blob.core.windows.net/global-buildings/dataset-links.csv")
 
+    if not os.path.exists('data'):
+        os.makedirs('data')
+
     # Use ThreadPoolExecutor to process files in parallel
     with ThreadPoolExecutor(max_workers=20) as executor:
         executor.map(process_file, dataset_links.to_dict(orient='records'))
-- 
GitLab