diff --git a/gen_data.sh b/gen_data.sh
index 5d253bfb4b058f7386a8baa03b1d0d47713b5579..9fc0b42f09e27ae63524e831c30165c3ec5ab97c 100755
--- a/gen_data.sh
+++ b/gen_data.sh
@@ -4,7 +4,7 @@
 ./inputs/setup_db.sh
 
 # download and transfrom the data
-#python ./inputs/download_transform.py
+python ./inputs/download_transform.py
 
 # load vector data into DB
 ./inputs/load_vector_data.sh
diff --git a/inputs/download_transform.py b/inputs/download_transform.py
index d5b752a7459c5796fd2f090bf6edbc92312b06df..531363776ba94b9decea3afa8b6c4504e93ac7fc 100644
--- a/inputs/download_transform.py
+++ b/inputs/download_transform.py
@@ -3,11 +3,15 @@
 import geopandas as gpd
 from shapely.geometry import shape
 from concurrent.futures import ThreadPoolExecutor
+# Define the list of locations you're interested in
+interested_locations = ['Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'CzechRepublic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Iceland', 'Switzerland', 'Liechtenstein', 'Norway', 'UnitedKingdom', 'Ukraine', 'UnitedStates']
+
 def process_file(row):
-    df = pd.read_json(row['Url'], lines=True)
-    df['geometry'] = df['geometry'].apply(shape)
-    gdf = gpd.GeoDataFrame(df, crs=4326)
-    gdf.to_file(f"data/{row['QuadKey']}.geojson", driver="GeoJSON")
+    if row['Location'] in interested_locations:
+        df = pd.read_json(row['Url'], lines=True)
+        df['geometry'] = df['geometry'].apply(shape)
+        gdf = gpd.GeoDataFrame(df, crs=4326)
+        gdf.to_file(f"data/{row['QuadKey']}.geojson", driver="GeoJSON")
 
 def main():
     dataset_links = pd.read_csv("https://minedbuildings.blob.core.windows.net/global-buildings/dataset-links.csv")
diff --git a/inputs/load_vector_data.sh b/inputs/load_vector_data.sh
index 9151fca7968415fa365345c75d30f8bfbdc71c98..400ff8171b9d057a70d33f86afe06f156933a556 100755
--- a/inputs/load_vector_data.sh
+++ b/inputs/load_vector_data.sh
@@ -11,51 +11,61 @@
 dir="data"
 
 # Ensure the directory exists
 if [ ! -d "$dir" ]; then
-  echo "Directory not found: $dir"
-  exit 1
+    echo "Directory not found: $dir"
+    exit 1
 fi
-# Initialize a counter
-counter=0
-
 # Maximum number of parallel ogr2ogr instances
-max_parallel=20
+max_parallel=10
+
+# Process the first file separately
+first_file=true
 
 # Array to hold the PIDs of the background processes
 declare -a pids
 
-# Loop through all files in the directory
+# Total number of files
+total_files=$(find "$dir" -type f | wc -l)
+processed_files=0
+remaining_files=$total_files
+
 for file in "$dir"/*; do
-  # Skip if not a file
-  if [ ! -f "$file" ]; then
-    continue
-  fi
-
-  echo "Processing file n. $counter: $file"
-
-  if [ $counter -eq 0 ]; then
-    # Process the first file
-    ogr2ogr -f MySQL MySQL:"$DB_NAME,host=$DB_HOST,user=$DB_USER,password=$DB_PASS" "$file" -nln footprints -update -overwrite -lco engine=Aria &
-    wait
-  else
-    # Process other files in parallel
-    ogr2ogr -f MySQL MySQL:"$DB_NAME,host=$DB_HOST,user=$DB_USER,password=$DB_PASS" "$file" -nln footprints -update -append &
-  fi
-
-  # Store the PID of the background process
-  pids[$counter]=$!
-
-  # Increment the counter
-  ((counter++))
-
-  # Wait if we have reached the maximum number of parallel instances
-  if (( counter % max_parallel == 0 )); then
-    wait "${pids[@]}"
-    pids=()
-  fi
+    # Skip if not a file
+    if [ ! -f "$file" ]; then
+        continue
+    fi
+
+    if [ "$first_file" = true ]; then
+        echo "Processing first file: $file"
+        ogr2ogr -f MySQL MySQL:"$DB_NAME,host=$DB_HOST,user=$DB_USER,password=$DB_PASS" "$file" -nln footprints -update -overwrite -lco engine=Aria
+        first_file=false
+        ((processed_files++))
+        ((remaining_files--))
+        echo "Processed files: $processed_files, Remaining: $remaining_files"
+    else
+        # Start processing the file in the background
+        echo "Processing file in parallel: $file"
+        ogr2ogr -f MySQL MySQL:"$DB_NAME,host=$DB_HOST,user=$DB_USER,password=$DB_PASS" "$file" -nln footprints -update -append &
+        pids+=($!)
+        ((processed_files++))
+        ((remaining_files--))
+        echo "Processed files: $processed_files, Remaining: $remaining_files"
+        # Check if we need to wait for any process to finish
+        while [ ${#pids[@]} -ge $max_parallel ]; do
+            # Check each process if it's still running
+            for i in "${!pids[@]}"; do
+                if ! kill -0 "${pids[$i]}" 2>/dev/null; then
+                    # Process is finished, remove it from the array
+                    unset 'pids[i]'
+                fi
+            done
+            # Update the array to remove any gaps
+            pids=("${pids[@]}")
+            sleep 1 # A short delay to prevent the loop from consuming too much CPU
+        done
+    fi
 done
 
 # Wait for any remaining background processes to finish
 wait
-
 echo "All processing complete."