Skip to content
Snippets Groups Projects
Commit e5e84c53 authored by Riccardo Boero's avatar Riccardo Boero :innocent:
Browse files

Selected only the countries that matter here.

parent 22013bfe
No related branches found
No related tags found
No related merge requests found
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
# Prepare the database schema.
./inputs/setup_db.sh

# Download and transform the data.
python ./inputs/download_transform.py

# Load vector data into DB.
./inputs/load_vector_data.sh
......
...@@ -3,11 +3,15 @@ import geopandas as gpd ...@@ -3,11 +3,15 @@ import geopandas as gpd
from shapely.geometry import shape from shapely.geometry import shape
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
# Locations (countries) whose building footprints we want to keep.
# A frozenset gives O(1) membership tests (process_file checks
# `row['Location'] in interested_locations` for every dataset row) and
# guards against accidental mutation; `in` behaves exactly as on the list.
# NOTE(review): names must match the `Location` column of the Microsoft
# global-buildings dataset-links.csv exactly — verify spellings such as
# 'CzechRepublic' against the current CSV.
interested_locations = frozenset([
    'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'CzechRepublic',
    'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece',
    'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg',
    'Malta', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia',
    'Slovenia', 'Spain', 'Sweden', 'Iceland', 'Switzerland',
    'Liechtenstein', 'Norway', 'UnitedKingdom', 'Ukraine', 'UnitedStates',
])
def process_file(row):
    """Fetch one quadkey's line-delimited GeoJSON and write it as a GeoJSON file.

    Rows whose `Location` is not in the module-level `interested_locations`
    are skipped entirely. The output file is named after the row's QuadKey
    and written under data/.
    """
    # Guard clause: ignore countries we are not interested in.
    if row['Location'] not in interested_locations:
        return
    records = pd.read_json(row['Url'], lines=True)
    # Convert the raw GeoJSON geometry dicts into shapely geometries.
    records['geometry'] = records['geometry'].apply(shape)
    frame = gpd.GeoDataFrame(records, crs=4326)
    frame.to_file(f"data/{row['QuadKey']}.geojson", driver="GeoJSON")
def main(): def main():
dataset_links = pd.read_csv("https://minedbuildings.blob.core.windows.net/global-buildings/dataset-links.csv") dataset_links = pd.read_csv("https://minedbuildings.blob.core.windows.net/global-buildings/dataset-links.csv")
......
...@@ -11,51 +11,61 @@ dir="data" ...@@ -11,51 +11,61 @@ dir="data"
# Ensure the data directory exists before doing anything else.
if [ ! -d "$dir" ]; then
    echo "Directory not found: $dir"
    exit 1
fi

# Maximum number of parallel ogr2ogr instances
max_parallel=10

# Process the first file separately (it creates/overwrites the table).
first_file=true

# Array to hold the PIDs of the background processes
declare -a pids

# Total number of files. Count with the same non-recursive glob the loop
# below uses, so the progress counters match what is actually processed
# (`find "$dir" -type f` would also count files in subdirectories, which
# the glob never visits).
total_files=0
for f in "$dir"/*; do
    [ -f "$f" ] && total_files=$((total_files + 1))
done
processed_files=0
remaining_files=$total_files

for file in "$dir"/*; do
    # Skip anything that is not a regular file
    if [ ! -f "$file" ]; then
        continue
    fi

    if [ "$first_file" = true ]; then
        # The first import runs in the foreground with -overwrite so the
        # destination table exists before any parallel -append starts.
        echo "Processing first file: $file"
        ogr2ogr -f MySQL MySQL:"$DB_NAME,host=$DB_HOST,user=$DB_USER,password=$DB_PASS" "$file" -nln footprints -update -overwrite -lco engine=Aria
        first_file=false
        processed_files=$((processed_files + 1))
        remaining_files=$((remaining_files - 1))
        echo "Processed files: $processed_files, Remaining: $remaining_files"
    else
        # Start processing the file in the background
        echo "Processing file in parallel: $file"
        ogr2ogr -f MySQL MySQL:"$DB_NAME,host=$DB_HOST,user=$DB_USER,password=$DB_PASS" "$file" -nln footprints -update -append &
        pids+=($!)
        processed_files=$((processed_files + 1))
        remaining_files=$((remaining_files - 1))
        echo "Processed files: $processed_files, Remaining: $remaining_files"

        # Throttle: once max_parallel jobs are in flight, block until one
        # exits (wait -n, bash 4.3+) instead of busy-polling kill -0 with
        # a sleep, then prune finished PIDs from the list.
        while [ "${#pids[@]}" -ge "$max_parallel" ]; do
            wait -n || true   # a failed import should not abort the loop
            alive=()
            for pid in "${pids[@]}"; do
                if kill -0 "$pid" 2>/dev/null; then
                    alive+=("$pid")
                fi
            done
            pids=("${alive[@]}")
        done
    fi
done

# Wait for any remaining background processes to finish
wait
echo "All processing complete."
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment