Skip to content
Snippets Groups Projects
Commit e5e84c53 authored by Riccardo Boero's avatar Riccardo Boero :innocent:
Browse files

Selected only the countries that matter here.

parent 22013bfe
No related branches found
No related tags found
No related merge requests found
......@@ -4,7 +4,7 @@
./inputs/setup_db.sh
# download and transform the data
python ./inputs/download_transform.py
# load vector data into DB
./inputs/load_vector_data.sh
......
......@@ -3,11 +3,15 @@ import geopandas as gpd
from shapely.geometry import shape
from concurrent.futures import ThreadPoolExecutor
# Locations to keep: each dataset row's 'Location' field is compared
# against this list.
interested_locations = [
    'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus',
    'CzechRepublic', 'Denmark', 'Estonia', 'Finland', 'France',
    'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy',
    'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands',
    'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia',
    'Spain', 'Sweden', 'Iceland', 'Switzerland', 'Liechtenstein',
    'Norway', 'UnitedKingdom', 'Ukraine', 'UnitedStates',
]
def process_file(row):
    """Download one tile and save it as GeoJSON, if its location is wanted.

    Rows whose ``Location`` is not in ``interested_locations`` are skipped
    before anything is downloaded.

    Args:
        row: Mapping-like record providing the ``Location``, ``Url`` and
            ``QuadKey`` fields (presumably one row of the dataset-links
            CSV read in ``main`` -- confirm against the caller).

    Returns:
        None. For a wanted location the result is written to
        ``data/<QuadKey>.geojson``.
    """
    # Filter first: the tile must be fetched and written at most once, and
    # only when its location is of interest.
    if row['Location'] not in interested_locations:
        return

    # The source is read as line-delimited JSON (one record per line).
    df = pd.read_json(row['Url'], lines=True)
    # Turn each raw geometry value into a shapely geometry object.
    df['geometry'] = df['geometry'].apply(shape)
    gdf = gpd.GeoDataFrame(df, crs=4326)
    gdf.to_file(f"data/{row['QuadKey']}.geojson", driver="GeoJSON")
def main():
dataset_links = pd.read_csv("https://minedbuildings.blob.core.windows.net/global-buildings/dataset-links.csv")
......
......@@ -11,51 +11,61 @@ dir="data"
# Ensure the directory exists; without it there is nothing to load, so stop
# with a non-zero status and report the problem on stderr
if [ ! -d "$dir" ]; then
  echo "Directory not found: $dir" >&2
  exit 1
fi
# Initialize a counter
counter=0
# Maximum number of parallel ogr2ogr instances
max_parallel=10
# The first file is handled separately from the others, so remember whether
# it has been seen yet
first_file=true
# Array to hold the PIDs of the background processes
declare -a pids=()
# Total number of files: counted with the same glob and regular-file test
# that the processing loop applies, so the "Remaining" figure is not inflated
# by files sitting in subdirectories (a recursive find would include those)
total_files=0
for candidate in "$dir"/*; do
  if [ -f "$candidate" ]; then
    total_files=$((total_files + 1))
  fi
done
processed_files=0
remaining_files=$total_files
# Load every regular file in "$dir" into the "footprints" layer exactly once.
# The first file is loaded synchronously with -overwrite (engine=Aria) so the
# layer is recreated before any append starts; every other file is appended by
# a background ogr2ogr job, with at most $max_parallel jobs alive at a time.
#
# NOTE(review): the password is passed on the ogr2ogr command line and is
# therefore visible in the process list while a job runs.
dsn="MySQL:$DB_NAME,host=$DB_HOST,user=$DB_USER,password=$DB_PASS"
# File name of each background job, kept index-aligned with pids so that a
# failure can be reported by file rather than by PID
job_files=()
failed_files=0

for file in "$dir"/*; do
  # Skip if not a file
  if [ ! -f "$file" ]; then
    continue
  fi

  if [ "$first_file" = true ]; then
    echo "Processing first file: $file"
    if ! ogr2ogr -f MySQL "$dsn" "$file" -nln footprints -update -overwrite -lco engine=Aria; then
      echo "ogr2ogr failed for: $file" >&2
      failed_files=$((failed_files + 1))
    fi
    first_file=false
  else
    # Start processing the file in the background
    echo "Processing file in parallel: $file"
    ogr2ogr -f MySQL "$dsn" "$file" -nln footprints -update -append &
    pids+=("$!")
    job_files+=("$file")
  fi

  # $(( )) assignments are used instead of (( var++ )): the latter returns a
  # non-zero status when the old value is 0, which aborts a script running
  # under "set -e"
  processed_files=$((processed_files + 1))
  remaining_files=$((remaining_files - 1))
  echo "Processed files: $processed_files, Remaining: $remaining_files"

  # Throttle: while the job table is full, drop finished jobs from it
  while [ "${#pids[@]}" -ge "$max_parallel" ]; do
    running_pids=()
    running_files=()
    for i in "${!pids[@]}"; do
      if kill -0 "${pids[$i]}" 2>/dev/null; then
        running_pids+=("${pids[$i]}")
        running_files+=("${job_files[$i]}")
      elif ! wait "${pids[$i]}"; then
        # wait on an already finished job returns that job's exit status
        echo "ogr2ogr failed for: ${job_files[$i]}" >&2
        failed_files=$((failed_files + 1))
      fi
    done
    pids=("${running_pids[@]}")
    job_files=("${running_files[@]}")
    # Only pause when no slot was freed, to avoid a busy loop
    if [ "${#pids[@]}" -ge "$max_parallel" ]; then
      sleep 1
    fi
  done
done

# Wait for any remaining background processes to finish, collecting their
# exit statuses along the way
for i in "${!pids[@]}"; do
  if ! wait "${pids[$i]}"; then
    echo "ogr2ogr failed for: ${job_files[$i]}" >&2
    failed_files=$((failed_files + 1))
  fi
done

if [ "$failed_files" -gt 0 ]; then
  echo "Warning: $failed_files file(s) failed to load" >&2
fi
echo "All processing complete."
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment