Skip to content
Snippets Groups Projects
Commit 50b26a9e authored by Riccardo Boero's avatar Riccardo Boero :innocent:
Browse files

FACT_jobs and FACT_geo fully developed and tested.

parent 3815ce9e
No related branches found
No related tags found
1 merge request!1Development
......@@ -2,7 +2,7 @@
julia_version = "1.9.4"
manifest_format = "2.0"
project_hash = "47197433fc52eee5a8ddb16b9468752d5fe2bd5e"
project_hash = "cbb3403901dd39f292beed52b9f397d2f7e700b2"
[[deps.ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
......@@ -372,6 +372,10 @@ deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
version = "1.10.0"
[[deps.Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[deps.UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
......
......@@ -7,3 +7,4 @@ version = "0.1.0"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
MySQL = "39abe10b-433b-5dbd-92d4-e302a9df00cd"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
......@@ -64,7 +64,12 @@ function nm_counties(conn::MySQL.Connection)
dftemp[!, :region] = fill("US", nrow(dftemp))
return dftemp
end
"""
    no_counties(conn::MySQL.Connection)

Query `eu_provinces` over `conn` for every row whose `cntr_code` is `'NO'`,
selecting the id (`geo_id`), the shape as text (`shape_obj`) and its area (`area`).
A `region` column filled with `"EU"` is added to the result before it is returned.
"""
function no_counties(conn::MySQL.Connection)
    sql = "SELECT nuts_id as geo_id, ST_AsText(SHAPE) as shape_obj, ST_Area(SHAPE) as area FROM eu_provinces WHERE cntr_code = 'NO';"
    provinces = query_connection(conn, sql)
    # Every row of this table selection is tagged with the EU region label.
    provinces[!, :region] = fill("EU", nrow(provinces))
    return provinces
end
function get_geo_objects(scale::String; host::String="127.0.0.1")
"""
table_dict
......@@ -96,7 +101,9 @@ function get_geo_objects(scale::String; host::String="127.0.0.1")
"us_blocks" => us_blocks,
"us_counties" => us_counties,
"us_states" => us_states,
"us_tracts" => us_tracts
"us_tracts" => us_tracts,
"nm_counties" => nm_counties,
"no_counties" => no_counties
)
"""
......@@ -127,7 +134,7 @@ function get_geo_objects(scale::String; host::String="127.0.0.1")
)
# data retrieval
df = get_all_objects_from_table(scale, df_template, table_dict, "FACT_geo"; host=host)
df = get_all_objects_from_table(scale, df_template, table_dict, "FACT_geo"; host=host, port=3307)
return df
end
......@@ -9,39 +9,127 @@ function us_lodes(conn::MySQL.Connection, selection::Dict)
dftemp = DataFrame()
for id in geo_id
if first_region_id_length == 15
query = "SELECT sum(CNS01) as jobs_11, sum(CNS02) as jobs_21, sum(CNS03) as jobs_22, sum(CNS04) as jobs_23, sum(CNS05) as jobs_31-33, sum(CNS06) as jobs_42, sum(CNS07) as jobs_44-45, sum(CNS08) as jobs_48-49, sum(CNS09) as jobs_51, sum(CNS10) as jobs_52, sum(CNS11) as jobs_53, sum(CNS12) as jobs_54, sum(CNS13) as jobs_55, sum(CNS14) as jobs_56, sum(CNS15) as jobs_61, sum(CNS16) as jobs_62, sum(CNS17) as jobs_71, sum(CNS18) as jobs_72, sum(CNS19) as jobs_81, sum(CNS20) as jobs_92 FROM LODES8 WHERE Year = "*year*" AND GeoID = '"*id*"';"
elseif first_region_id_length < 15 & first_region_id_length > 0
query = "SELECT sum(CNS01) as jobs_11, sum(CNS02) as jobs_21, sum(CNS03) as jobs_22, sum(CNS04) as jobs_23, sum(CNS05) as jobs_31-33, sum(CNS06) as jobs_42, sum(CNS07) as jobs_44-45, sum(CNS08) as jobs_48-49, sum(CNS09) as jobs_51, sum(CNS10) as jobs_52, sum(CNS11) as jobs_53, sum(CNS12) as jobs_54, sum(CNS13) as jobs_55, sum(CNS14) as jobs_56, sum(CNS15) as jobs_61, sum(CNS16) as jobs_62, sum(CNS17) as jobs_71, sum(CNS18) as jobs_72, sum(CNS19) as jobs_81, sum(CNS20) as jobs_92 FROM LODES8 WHERE Year = "*year*" AND GeoID LIKE '"*id*"%';"
query = "SELECT sum(CNS01) as jobs_11, sum(CNS02) as jobs_21, sum(CNS03) as jobs_22, sum(CNS04) as jobs_23, sum(CNS05) as jobs_31_33, sum(CNS06) as jobs_42, sum(CNS07) as jobs_44_45, sum(CNS08) as jobs_48_49, sum(CNS09) as jobs_51, sum(CNS10) as jobs_52, sum(CNS11) as jobs_53, sum(CNS12) as jobs_54, sum(CNS13) as jobs_55, sum(CNS14) as jobs_56, sum(CNS15) as jobs_61, sum(CNS16) as jobs_62, sum(CNS17) as jobs_71, sum(CNS18) as jobs_72, sum(CNS19) as jobs_81, sum(CNS20) as jobs_92 FROM LODES8 WHERE Year = "*string(year)*" AND GeoID = '"*id*"';"
elseif first_region_id_length < 15 && first_region_id_length > 0
query = "SELECT sum(CNS01) as jobs_11, sum(CNS02) as jobs_21, sum(CNS03) as jobs_22, sum(CNS04) as jobs_23, sum(CNS05) as jobs_31_33, sum(CNS06) as jobs_42, sum(CNS07) as jobs_44_45, sum(CNS08) as jobs_48_49, sum(CNS09) as jobs_51, sum(CNS10) as jobs_52, sum(CNS11) as jobs_53, sum(CNS12) as jobs_54, sum(CNS13) as jobs_55, sum(CNS14) as jobs_56, sum(CNS15) as jobs_61, sum(CNS16) as jobs_62, sum(CNS17) as jobs_71, sum(CNS18) as jobs_72, sum(CNS19) as jobs_81, sum(CNS20) as jobs_92 FROM LODES8 WHERE Year = "*string(year)*" AND GeoID LIKE '"*id*"%';"
end
result = query_connection(conn, query)
result[!, :geo_id] = fill(id, nrow(result))
dftemp = vcat(dftemp, result)
long_df = stack(result, names(result), variable_name=:industry, value_name=:jobs)
long_df.industry = replace.(string.(long_df.industry), "jobs_" => "")
long_df[!, :geo_id] = fill(id, nrow(long_df))
long_df[!, :agg_level] = fill(2, nrow(long_df))
dftemp = vcat(dftemp, long_df)
end
dftemp[!, :region] = fill("US", nrow(dftemp))
return dftemp
end
"""
    us_qcew(conn::MySQL.Connection, selection::Dict)

Retrieve average annual jobs by industry from the `QCEW` table for each requested
geography.

`selection` must provide:
- `"year"`: the year to filter on (converted with `string` for the query);
- `"geo_id"`: an iterable of string ids (national ids start with `"US"`, other ids
  must be parseable as integers);
- `"agg_level"`: integer level of industrial detail, used to derive `Agglvl_code`.

For every id one query is run through `query_connection`; the monthly job columns
are averaged over 12 months in SQL. Columns `geo_id`, `agg_level` and `region`
(always `"US"`) are added to the stacked result, which is returned.

Throws `ArgumentError` (from `parse`) if a non-national id is not an integer string.
"""
function us_qcew(conn::MySQL.Connection, selection::Dict)
    # Agglvl_code reference:
    #   14-18 National, by NAICS sector / 3- / 4- / 5- / 6-digit
    #   54-58 Statewide, by NAICS sector / 3- / 4- / 5- / 6-digit
    #   74-78 County,    by NAICS sector / 3- / 4- / 5- / 6-digit
    year = selection["year"]
    geo_id = selection["geo_id"]
    agg_level = selection["agg_level"]
    # object to be returned
    dftemp = DataFrame()
    for id in geo_id
        # Spatial level. The national prefix must be tested BEFORE parsing the id as
        # an integer: an id starting with "US" is not numeric and `parse` would throw,
        # which previously made the national branch unreachable.
        if startswith(id, "US")
            agglvl_code = 10
        elseif parse(Int, id) % 1000 == 0
            agglvl_code = 50
        else
            agglvl_code = 70
        end
        # The second digit of Agglvl_code encodes the industry detail
        # (e.g. agg_level 2 -> x4, NAICS sector).
        agglvl_code += (agg_level + 2)
        # NOTE(review): the query is assembled by string concatenation; `id` and
        # `year` must come from trusted callers.
        query = "SELECT Naics as industry, (Jan_jobs+Feb_jobs+Mar_jobs+Apr_jobs+May_jobs+Jun_jobs+Jul_jobs+Aug_jobs+Sep_jobs+Oct_jobs+Nov_jobs+Dec_jobs)/12 as jobs FROM QCEW WHERE Year = "*string(year)*" AND GeoID = '"*id*"' AND Agglvl_code = "*string(agglvl_code)*";"
        result = query_connection(conn, query)
        # tag rows with the geography and the requested aggregation level
        result[!, :geo_id] = fill(id, nrow(result))
        result[!, :agg_level] = fill(agg_level, nrow(result))
        dftemp = vcat(dftemp, result)
    end
    dftemp[!, :region] = fill("US", nrow(dftemp))
    return dftemp
end
"""
    eu_lfs(conn::MySQL.Connection, selection::Dict)

Retrieve annual jobs by industry from the `LFS` table for each requested geography.

`selection` must provide `"year"` and `"geo_id"` (an iterable of string ids).
For every id one query is run through `query_connection`; in SQL the four quarterly
`EmpTh_Q*` columns are averaged and multiplied by 1000 (presumably because they are
expressed in thousands -- confirm against the table definition). Columns `geo_id`,
`agg_level` (always `2`) and `region` (always `"EU"`) are added to the stacked
result, which is returned.
"""
function eu_lfs(conn::MySQL.Connection, selection::Dict)
    # getting valuable info from dictionary
    year = selection["year"]
    geo_id = selection["geo_id"]
    # object to be returned
    dftemp = DataFrame()
    # iterate over geo objects
    for id in geo_id
        # prepare query (a leftover LODES8 query that was built here and immediately
        # overwritten has been removed: it concatenated `year` without `string` and
        # would fail for integer years)
        query = "SELECT Nace as industry, ((EmpTh_Q1+EmpTh_Q2+EmpTh_Q3+EmpTh_Q4)/4)*1000 as jobs FROM LFS WHERE Year = "*string(year)*" AND GeoID = '"*id*"';"
        # execute
        result = query_connection(conn, query)
        # add geo_id col
        result[!, :geo_id] = fill(id, nrow(result))
        # add agg level
        result[!, :agg_level] = fill(2, nrow(result))
        # copy results to be returned
        dftemp = vcat(dftemp, result)
    end
    dftemp[!, :region] = fill("EU", nrow(dftemp))
    return dftemp
end
"""
    eu_sbs(conn::MySQL.Connection, selection::Dict)

Collect `Nace`/`Employment` rows from the `SBS` table, one query per entry of
`selection["geo_id"]`, filtered on `selection["year"]`.

Each per-geography result gets a `geo_id` column and an `agg_level` column fixed to
`4`; the stacked table gets a `region` column fixed to `"EU"` and is returned.
"""
function eu_sbs(conn::MySQL.Connection, selection::Dict)
    year = selection["year"]
    region_ids = selection["geo_id"]
    collected = DataFrame()
    for region_id in region_ids
        year_txt = string(year)
        sql = "SELECT Nace as industry, Employment as jobs FROM SBS WHERE Year = " * year_txt * " AND GeoID = '" * region_id * "';"
        rows = query_connection(conn, sql)
        # tag the fetched rows with their geography and aggregation level
        rows[!, :geo_id] = fill(region_id, nrow(rows))
        rows[!, :agg_level] = fill(4, nrow(rows))
        # append to the running table
        collected = vcat(collected, rows)
    end
    collected[!, :region] = fill("EU", nrow(collected))
    return collected
end
"""
    eu_rea(conn::MySQL.Connection, selection::Dict)

Retrieve annual jobs by industry from the `REA` table for each requested geography.

`selection` must provide `"year"` and `"geo_id"` (an iterable of string ids).
For every id one query is run through `query_connection`; `EmpTh` is multiplied by
1000 in SQL (presumably because it is expressed in thousands -- confirm against the
table definition). Columns `geo_id`, `agg_level` (always `1`) and `region`
(always `"EU"`) are added to the stacked result, which is returned.
"""
function eu_rea(conn::MySQL.Connection, selection::Dict)
    # getting valuable info from dictionary
    year = selection["year"]
    geo_id = selection["geo_id"]
    # object to be returned
    dftemp = DataFrame()
    # iterate over geo objects
    for id in geo_id
        # prepare query
        query = "SELECT Nace as industry, EmpTh*1000 as jobs FROM REA WHERE Year = "*string(year)*" AND GeoID = '"*id*"';"
        # execute
        result = query_connection(conn, query)
        # add geo_id col (assigned once; a duplicated assignment was removed)
        result[!, :geo_id] = fill(id, nrow(result))
        # add agg level
        result[!, :agg_level] = fill(1, nrow(result))
        # copy results to be returned
        dftemp = vcat(dftemp, result)
    end
    # Region label: only "EU" is set; a preceding "US" assignment that was
    # immediately overwritten has been removed.
    dftemp[!, :region] = fill("EU", nrow(dftemp))
    return dftemp
end
function get_annual_jobs(table::String, selection::Dict; host::String="127.0.0.1")
table_dict = Dict(
"us_lodes" => us_lodes,
......@@ -54,11 +142,13 @@ function get_annual_jobs(table::String, selection::Dict; host::String="127.0.0.1
df_template = DataFrame(
region = String[], # column to store if EU or US
geo_id = String[], # Column for Geo IDs
agg_level = Int8[], # Level of industrial classification
industry = String[], # industry identifier
jobs = Float64[] # column for average number of annual jobs
)
# data retrieval
df = get_selected_objects_from_table(table, df_template, table_dict, "FACT_jobs", selection; host=host)
df = get_selected_objects_from_table(table, df_template, table_dict, "FACT_jobs", selection; host=host, port=3308)
return df
end
......
......@@ -98,12 +98,12 @@ specific tables or actions. For each identified table, the function fetches data
- It performs type checking and conversion to ensure that the data types of the columns in the result match those in the original `df`.
- Additional data type conversions should be added as necessary for other column types.
"""
function get_all_objects_from_table(scale::String, df::DataFrame, table_dict::Dict, database::String; host::String="127.0.0.1")
function get_all_objects_from_table(scale::String, df::DataFrame, table_dict::Dict, database::String; host::String="127.0.0.1", port::Int=3306)
# Interpret the scale using an external function
interpreted_scale = interpret_scale(scale)
# Establish the connection
conn = establish_connection(host, "root", "devops", database; port=3307)
conn = establish_connection(host, "root", "devops", database; port=port)
for table in interpreted_scale
result = handle_simple_action(String(table), table_dict, conn)
......@@ -119,8 +119,6 @@ function get_all_objects_from_table(scale::String, df::DataFrame, table_dict::Di
result[!, col] = string.(result[:, col])
elseif df_col_type == Float64
result[!, col] = [tryparse(Float64, x) for x in result[:, col]]
#else
# Add more type conversions as needed
end
end
end
......@@ -144,12 +142,13 @@ function handle_selection_action(table::String, table_dict::Dict, conn::MySQL.Co
return func(conn, selection)
end
function get_selected_objects_from_table(table::String, df::DataFrame, table_dict::Dict, database::String, selection::Dict; host::String="127.0.0.1")
function get_selected_objects_from_table(table::String, df::DataFrame, table_dict::Dict, database::String, selection::Dict; host::String="127.0.0.1", port::Int=3306)
# Establish the connection
conn = establish_connection(host, "root", "devops", database; port=3307)
conn = establish_connection(host, "root", "devops", database; port=port)
# get results
result = handle_selection_action(String(table), table_dict, conn, selection)
# start converting data types to fix mistakes in SQL types interpretation
result = result[:, names(df)]
# Check and convert each column in result to match the type in df
for col in names(df)
......@@ -162,8 +161,6 @@ function get_selected_objects_from_table(table::String, df::DataFrame, table_dic
result[!, col] = string.(result[:, col])
elseif df_col_type == Float64
result[!, col] = [tryparse(Float64, x) for x in result[:, col]]
#else
# Add more type conversions as needed
end
end
end
......
# Checks the table returned by `get_geo_objects` for the "nm_counties" scale:
# its type, its exact column order and its dimensions.
@testset "FACTDataREader FACT_geo test" begin
    nm_geo = get_geo_objects("nm_counties")
    # the result must be a DataFrame
    @test nm_geo isa DataFrame
    # columns must appear in exactly this order
    @test names(nm_geo) == ["region", "geo_id", "shape_obj", "area"]
    # expected dimensions: 33 rows, 4 columns
    @test size(nm_geo, 1) == 33
    @test size(nm_geo, 2) == 4
end
using DataFrames
# Checks the table returned by `get_geo_objects` for the "nm_counties" scale:
# its type, the presence of the expected columns and its dimensions.
@testset "FACTDataREader FACT_geo test" begin
    df = get_geo_objects("nm_counties")
    # Test if the result is a DataFrame
    @test df isa DataFrame
    # Test for expected columns. `names(df)` yields strings, and Julia has no
    # `ismember`, so membership is checked with `issubset` on string names.
    @test issubset(["region", "geo_id", "shape_obj", "area"], names(df))
    # Test for expected number of rows/columns
    @test size(df, 1) == 33
    @test size(df, 2) == 4
end
# Checks `get_annual_jobs` for the "us_lodes" and "us_qcew" tables, using the first
# three geographies returned for the "nm_counties" scale.
@testset "FACTDataREader FACT_jobs test" begin
    df1 = get_geo_objects("nm_counties")
    # keep only the first three geographies to keep the test small
    resize!(df1, 3)
    year = 2018
    agg_lvl = 2
    selection_dict = Dict(
        "year" => year,
        "geo_id" => df1[!, :geo_id],
        "agg_level" => agg_lvl
    )
    # TEST of LODES
    df_lodes = get_annual_jobs("us_lodes", selection_dict)
    # Test if the result is a DataFrame
    @test df_lodes isa DataFrame
    # Test for expected number of rows/columns
    @test size(df_lodes, 1) == 60
    @test size(df_lodes, 2) == 5
    # TEST of QCEW
    df_qcew = get_annual_jobs("us_qcew", selection_dict)
    # Test if the result is a DataFrame
    @test df_qcew isa DataFrame
    # Test for expected number of rows/columns
    @test size(df_qcew, 1) == 57
    @test size(df_qcew, 2) == 5
end
# Checks the table returned by `get_geo_objects` for the "no_counties" scale:
# its type, its exact column order and its dimensions.
@testset "FACTDataREader FACT_geo test" begin
    no_geo = get_geo_objects("no_counties")
    # the result must be a DataFrame
    @test no_geo isa DataFrame
    # columns must appear in exactly this order
    @test names(no_geo) == ["region", "geo_id", "shape_obj", "area"]
    # expected dimensions: 13 rows, 4 columns
    @test size(no_geo, 1) == 13
    @test size(no_geo, 2) == 4
end
# Checks `get_annual_jobs` for the "eu_rea", "eu_lfs" and "eu_sbs" tables.
# REA uses the first three geographies returned for the "no_counties" scale;
# LFS and SBS are queried for the single id "NO".
@testset "FACTDataREader FACT_jobs test" begin
    df1 = get_geo_objects("no_counties")
    # keep only the first three geographies to keep the test small
    resize!(df1, 3)
    year = 2018
    agg_lvl = 2
    selection_dict = Dict(
        "year" => year,
        "geo_id" => df1[!, :geo_id],
        "agg_level" => agg_lvl
    )
    # TEST of REA
    df_rea = get_annual_jobs("eu_rea", selection_dict)
    # Test if the result is a DataFrame
    @test df_rea isa DataFrame
    # Test for expected number of rows/columns
    @test size(df_rea, 1) == 39
    @test size(df_rea, 2) == 5
    # TEST of LFS
    selection_dict_NO = Dict(
        "year" => year,
        "geo_id" => ["NO"]
    )
    df_lfs = get_annual_jobs("eu_lfs", selection_dict_NO)
    # Test if the result is a DataFrame
    @test df_lfs isa DataFrame
    # Test for expected number of rows/columns
    @test size(df_lfs, 1) == 85
    @test size(df_lfs, 2) == 5
    # TEST of SBS
    selection_dict_NO_21 = Dict(
        "year" => 2021,
        "geo_id" => ["NO"]
    )
    df_sbs = get_annual_jobs("eu_sbs", selection_dict_NO_21)
    # The assertions below previously re-checked `df_lfs`, leaving the SBS result
    # untested; they now target `df_sbs`.
    @test df_sbs isa DataFrame
    # NOTE(review): the exact SBS row count for NO/2021 is not known from this file
    # (the earlier value 85 belonged to the LFS result) -- replace `> 0` with the
    # exact expected count once confirmed against the database.
    @test size(df_sbs, 1) > 0
    @test size(df_sbs, 2) == 5
end
using Test
using DataFrames
using FACTDataReader
@testset "FACTDataREader Tests" begin
include("nm_test.jl")
@testset "FACTDataREader New Mexico Tests" begin
#include("nm_test/FACT_geo.jl")
#include("nm_test/FACT_jobs.jl")
end
@testset "FACTDataREader Norway Tests" begin
#include("no_test/FACT_geo.jl")
#include("no_test/FACT_jobs.jl")
end
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment