From 7f82aaf589091a49eceec8176f6cdcf6a94a7ae7 Mon Sep 17 00:00:00 2001
From: Riccardo Boero <ribo@nilu.no>
Date: Wed, 2 Oct 2024 12:24:49 +0200
Subject: [PATCH] Fixed download

---
 inputs/download_transform.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/inputs/download_transform.py b/inputs/download_transform.py
index eabdbb9..3732054 100644
--- a/inputs/download_transform.py
+++ b/inputs/download_transform.py
@@ -1,17 +1,29 @@
+import os
 import pandas as pd
 import geopandas as gpd
 from shapely.geometry import shape
+from concurrent.futures import ThreadPoolExecutor
 
-def main():
-    # Create the directory if it does not exist
-    os.makedirs('../data', exist_ok=True)
-    dataset_links = pd.read_csv("https://minedbuildings.blob.core.windows.net/global-buildings/dataset-links.csv")
-    for _, row in dataset_links.iterrows():
-        df = pd.read_json(row.Url, lines=True)
+# Define the list of locations you're interested in
+interested_locations = ['Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'CzechRepublic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Iceland', 'Switzerland', 'Liechtenstein', 'Norway', 'UnitedKingdom', 'Ukraine', 'UnitedStates']
+
+def process_file(row):
+    file_path = f"data/{row['QuadKey']}.geojson"
+    if row['Location'] in interested_locations and not os.path.exists(file_path):
+        df = pd.read_json(row['Url'], lines=True)
         df['geometry'] = df['geometry'].apply(shape)
         gdf = gpd.GeoDataFrame(df, crs=4326)
-        gdf.to_file(f"../data/{row.QuadKey}.geojson", driver="GeoJSON")
+        gdf.to_file(file_path, driver="GeoJSON")
 
+def main():
+    # Load the dataset links
+    dataset_links = pd.read_csv("https://minedbuildings.blob.core.windows.net/global-buildings/dataset-links.csv")
+    # Create the 'data' directory if it doesn't exist
+    if not os.path.exists('data'):
+        os.makedirs('data')
+    # Use ThreadPoolExecutor to process files in parallel
+    with ThreadPoolExecutor(max_workers=20) as executor:
+        executor.map(process_file, dataset_links.to_dict(orient='records'))
 
 if __name__ == "__main__":
     main()
-- 
GitLab
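
Note on the ThreadPoolExecutor change: executor.map() submits every task but
returns a lazy iterator, and since that iterator is never consumed here, any
exception raised inside process_file is stored on its future and never
surfaced. Below is a minimal sketch of a variant that reports per-file
failures, reusing the process_file function and the 20-worker count from the
patch (run_all is a hypothetical helper, not part of the commit):

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def run_all(records):
        # Submit one task per dataset row, mirroring the patch's pool size.
        with ThreadPoolExecutor(max_workers=20) as executor:
            futures = {executor.submit(process_file, rec): rec for rec in records}
            # as_completed yields each future as it finishes, in any order.
            for future in as_completed(futures):
                try:
                    future.result()  # re-raises any exception from process_file
                except Exception as exc:
                    print(f"{futures[future]['QuadKey']} failed: {exc}")

    # Usage (hypothetical): run_all(dataset_links.to_dict(orient='records'))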