Selected only the countries that matter here.

e5e84c53 · Riccardo Boero · 22013bfe · e5e84c53 · e5e84c53 · e5e84c53
Commit e5e84c53 authored 1 year ago by Riccardo Boero
--- a/gen_data.sh
+++ b/gen_data.sh
@@ -4,7 +4,7 @@
 ./inputs/setup_db.sh

 # download and transform the data
-#python ./inputs/download_transform.py
+python ./inputs/download_transform.py

 # load vector data into DB
 ./inputs/load_vector_data.sh

--- a/inputs/download_transform.py
+++ b/inputs/download_transform.py
@@ -3,11 +3,15 @@ import geopandas as gpd
 from shapely.geometry import shape
 from concurrent.futures import ThreadPoolExecutor

+# Define the list of locations you're interested in
+interested_locations = ['Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'CzechRepublic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Iceland', 'Switzerland', 'Liechtenstein', 'Norway', 'UnitedKingdom', 'Ukraine', 'UnitedStates']
+
 def process_file(row):
-    df = pd.read_json(row['Url'], lines=True)
-    df['geometry'] = df['geometry'].apply(shape)
-    gdf = gpd.GeoDataFrame(df, crs=4326)
-    gdf.to_file(f"data/{row['QuadKey']}.geojson", driver="GeoJSON")
+    if row['Location'] in interested_locations:
+        df = pd.read_json(row['Url'], lines=True)
+        df['geometry'] = df['geometry'].apply(shape)
+        gdf = gpd.GeoDataFrame(df, crs=4326)
+        gdf.to_file(f"data/{row['QuadKey']}.geojson", driver="GeoJSON")

 def main():
    dataset_links = pd.read_csv("https://minedbuildings.blob.core.windows.net/global-buildings/dataset-links.csv")

--- a/inputs/load_vector_data.sh
+++ b/inputs/load_vector_data.sh
@@ -11,51 +11,61 @@ dir="data"

 # Ensure the directory exists
 if [ ! -d "$dir" ]; then
-    echo "Directory not found: $dir"
-    exit 1
+	    echo "Directory not found: $dir"
+	        exit 1
 fi

-# Initialize a counter
-counter=0
-
 # Maximum number of parallel ogr2ogr instances
-max_parallel=20
+max_parallel=10
+
+# Process the first file separately
+first_file=true

 # Array to hold the PIDs of the background processes
 declare -a pids

-# Loop through all files in the directory
+# Total number of files
+total_files=$(find "$dir" -type f | wc -l)
+processed_files=0
+remaining_files=$total_files
+
 for file in "$dir"/*; do
-    # Skip if not a file
-    if [ ! -f "$file" ]; then
-        continue
-    fi
-
-    echo "Processing file n. $counter: $file"
-
-    if [ $counter -eq 0 ]; then
-        # Process the first file
-        ogr2ogr -f MySQL MySQL:"$DB_NAME,host=$DB_HOST,user=$DB_USER,password=$DB_PASS" "$file" -nln footprints -update -overwrite -lco engine=Aria &
-    	wait
-    else
-        # Process other files in parallel
-        ogr2ogr -f MySQL MySQL:"$DB_NAME,host=$DB_HOST,user=$DB_USER,password=$DB_PASS" "$file" -nln footprints -update -append &
-    fi
-
-    # Store the PID of the background process
-    pids[$counter]=$!
-
-    # Increment the counter
-    ((counter++))
-
-    # Wait if we have reached the maximum number of parallel instances
-    if (( counter % max_parallel == 0 )); then
-        wait "${pids[@]}"
-        pids=()
-    fi
+  # Skip if not a file
+  if [ ! -f "$file" ]; then
+  	continue
+  fi
+  
+  if [ "$first_file" = true ]; then
+	echo "Processing first file: $file"
+	ogr2ogr -f MySQL MySQL:"$DB_NAME,host=$DB_HOST,user=$DB_USER,password=$DB_PASS" "$file" -nln footprints -update -overwrite -lco engine=Aria
+	first_file=false
+	((processed_files++))
+	((remaining_files--))
+	echo "Processed files: $processed_files, Remaining: $remaining_files"
+  else
+	# Start processing the file in the background
+	echo "Processing file in parallel: $file"
+	ogr2ogr -f MySQL MySQL:"$DB_NAME,host=$DB_HOST,user=$DB_USER,password=$DB_PASS" "$file" -nln footprints -update -append &
+	pids+=($!)	
+	((processed_files++))
+	((remaining_files--))
+	echo "Processed files: $processed_files, Remaining: $remaining_files"
+	# Check if we need to wait for any process to finish
+	while [ ${#pids[@]} -ge $max_parallel ]; do
+		# Check each process if it's still running
+		for i in "${!pids[@]}"; do
+			if ! kill -0 "${pids[$i]}" 2>/dev/null; then
+				# Process is finished, remove it from the array
+				unset 'pids[i]'
+			fi
+		done
+		# Update the array to remove any gaps
+		pids=("${pids[@]}")
+		sleep 1 # A short delay to prevent the loop from consuming too much CPU
+	done
+  fi
 done

 # Wait for any remaining background processes to finish
 wait
-
 echo "All processing complete."