FACT_jobs and FACT_geo fully developed and tested.

50b26a9e · Riccardo Boero · 3815ce9e · 50b26a9e · 50b26a9e · 50b26a9e
Commit 50b26a9e authored 1 year ago by Riccardo Boero
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -2,7 +2,7 @@

 julia_version = "1.9.4"
 manifest_format = "2.0"
-project_hash = "47197433fc52eee5a8ddb16b9468752d5fe2bd5e"
+project_hash = "cbb3403901dd39f292beed52b9f397d2f7e700b2"

 [[deps.ArgTools]]
 uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
@@ -372,6 +372,10 @@ deps = ["ArgTools", "SHA"]
 uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
 version = "1.10.0"

+[[deps.Test]]
+deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
+uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
 [[deps.UUIDs]]
 deps = ["Random", "SHA"]
 uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

--- a/Project.toml
+++ b/Project.toml
@@ -7,3 +7,4 @@ version = "0.1.0"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 MySQL = "39abe10b-433b-5dbd-92d4-e302a9df00cd"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
--- a/src/FACT_geo.jl
+++ b/src/FACT_geo.jl
@@ -64,7 +64,12 @@ function nm_counties(conn::MySQL.Connection)
    dftemp[!, :region] = fill("US", nrow(dftemp))
    return dftemp
 end
-
+function no_counties(conn::MySQL.Connection)
+    dftemp = query_connection(conn, "SELECT nuts_id as geo_id, ST_AsText(SHAPE) as shape_obj, ST_Area(SHAPE) as area FROM eu_provinces WHERE cntr_code = 'NO';")
+    dftemp[!, :region] = fill("EU", nrow(dftemp))
+    return dftemp
+end
+ 
 function get_geo_objects(scale::String; host::String="127.0.0.1")
    """
    table_dict
@@ -96,7 +101,9 @@ function get_geo_objects(scale::String; host::String="127.0.0.1")
        "us_blocks" => us_blocks,
        "us_counties" => us_counties,
        "us_states" => us_states,
-        "us_tracts" => us_tracts
+        "us_tracts" => us_tracts,
+        "nm_counties" => nm_counties,
+        "no_counties" => no_counties
    )

    """
@@ -127,7 +134,7 @@ function get_geo_objects(scale::String; host::String="127.0.0.1")
    )

    # data retrieval
-    df = get_all_objects_from_table(scale, df_template, table_dict, "FACT_geo"; host=host) 
+    df = get_all_objects_from_table(scale, df_template, table_dict, "FACT_geo"; host=host, port=3307) 

    return df
 end
--- a/src/FACT_jobs.jl
+++ b/src/FACT_jobs.jl
@@ -9,39 +9,127 @@ function us_lodes(conn::MySQL.Connection, selection::Dict)
    dftemp = DataFrame()
    for id in geo_id
        if first_region_id_length == 15
-            query = "SELECT sum(CNS01) as jobs_11, sum(CNS02) as jobs_21, sum(CNS03) as jobs_22, sum(CNS04) as jobs_23, sum(CNS05) as jobs_31-33, sum(CNS06) as jobs_42, sum(CNS07) as jobs_44-45, sum(CNS08) as jobs_48-49, sum(CNS09) as jobs_51, sum(CNS10) as jobs_52, sum(CNS11) as jobs_53, sum(CNS12) as jobs_54, sum(CNS13) as jobs_55, sum(CNS14) as jobs_56, sum(CNS15) as jobs_61, sum(CNS16) as jobs_62, sum(CNS17) as jobs_71, sum(CNS18) as jobs_72, sum(CNS19) as jobs_81, sum(CNS20) as jobs_92 FROM LODES8 WHERE Year = "*year*" AND GeoID = '"*id*"';"
-        elseif first_region_id_length < 15 & first_region_id_length > 0
-            query = "SELECT sum(CNS01) as jobs_11, sum(CNS02) as jobs_21, sum(CNS03) as jobs_22, sum(CNS04) as jobs_23, sum(CNS05) as jobs_31-33, sum(CNS06) as jobs_42, sum(CNS07) as jobs_44-45, sum(CNS08) as jobs_48-49, sum(CNS09) as jobs_51, sum(CNS10) as jobs_52, sum(CNS11) as jobs_53, sum(CNS12) as jobs_54, sum(CNS13) as jobs_55, sum(CNS14) as jobs_56, sum(CNS15) as jobs_61, sum(CNS16) as jobs_62, sum(CNS17) as jobs_71, sum(CNS18) as jobs_72, sum(CNS19) as jobs_81, sum(CNS20) as jobs_92 FROM LODES8 WHERE Year = "*year*" AND GeoID LIKE '"*id*"%';"
+            query = "SELECT sum(CNS01) as jobs_11, sum(CNS02) as jobs_21, sum(CNS03) as jobs_22, sum(CNS04) as jobs_23, sum(CNS05) as jobs_31_33, sum(CNS06) as jobs_42, sum(CNS07) as jobs_44_45, sum(CNS08) as jobs_48_49, sum(CNS09) as jobs_51, sum(CNS10) as jobs_52, sum(CNS11) as jobs_53, sum(CNS12) as jobs_54, sum(CNS13) as jobs_55, sum(CNS14) as jobs_56, sum(CNS15) as jobs_61, sum(CNS16) as jobs_62, sum(CNS17) as jobs_71, sum(CNS18) as jobs_72, sum(CNS19) as jobs_81, sum(CNS20) as jobs_92 FROM LODES8 WHERE Year = "*string(year)*" AND GeoID = '"*id*"';"
+        elseif first_region_id_length < 15 && first_region_id_length > 0
+            query = "SELECT sum(CNS01) as jobs_11, sum(CNS02) as jobs_21, sum(CNS03) as jobs_22, sum(CNS04) as jobs_23, sum(CNS05) as jobs_31_33, sum(CNS06) as jobs_42, sum(CNS07) as jobs_44_45, sum(CNS08) as jobs_48_49, sum(CNS09) as jobs_51, sum(CNS10) as jobs_52, sum(CNS11) as jobs_53, sum(CNS12) as jobs_54, sum(CNS13) as jobs_55, sum(CNS14) as jobs_56, sum(CNS15) as jobs_61, sum(CNS16) as jobs_62, sum(CNS17) as jobs_71, sum(CNS18) as jobs_72, sum(CNS19) as jobs_81, sum(CNS20) as jobs_92 FROM LODES8 WHERE Year = "*string(year)*" AND GeoID LIKE '"*id*"%';"
        end
        result = query_connection(conn, query)
-        result[!, :geo_id] = fill(id, nrow(result))    
-        dftemp = vcat(dftemp, result)
+        long_df = stack(result, names(result), variable_name=:industry, value_name=:jobs)
+        long_df.industry = replace.(string.(long_df.industry), "jobs_" => "")
+        long_df[!, :geo_id] = fill(id, nrow(long_df))
+        long_df[!, :agg_level] = fill(2, nrow(long_df)) 
+        dftemp = vcat(dftemp, long_df)
    end 
    dftemp[!, :region] = fill("US", nrow(dftemp))
    return dftemp
 end

 function us_qcew(conn::MySQL.Connection, selection::Dict)
-    #GeoID: us state county
-    #Naics: depending on 
-    #Agglvl_code: 14 	National, by NAICS Sector; 15 	National, by NAICS 3-digit; 16 	National, by NAICS 4-digit; 17 	National, by NAICS 5-digit; 18 	National, by NAICS 6-digit; 54 	Statewide, NAICS Sector; 55 	Statewide, NAICS 3-digit; 56 	Statewide, NAICS 4-digit; 57 	Statewide, NAICS 5-digit; 58 	Statewide, NAICS 6-digit; 74 	County, NAICS Sector; 75 	County, NAICS 3-digit; 76 	County, NAICS 4-digit; 77 	County, NAICS 5-digit; 78 	County, NAICS 6-digit
-
+    # QCEW jobs per industry for every id in selection["geo_id"], for selection["year"]
     year = selection["year"]
     geo_id = selection["geo_id"]
-
+    agg_level = selection["agg_level"]
+    # accumulator: per-id results are appended here and returned with region = "US"
     dftemp = DataFrame()
+    # one query per geo id; each id is matched exactly against GeoID
+    for id in geo_id
+        # Spatial level -> base Agglvl_code: 10 for "US"-prefixed ids, 50 for numeric
+        # ids divisible by 1000, 70 otherwise. The prefix is tested BEFORE parse():
+        # parse(Int, id) throws on a non-numeric id, which made this branch unreachable.
+        if startswith(id, "US")
+            agglvl_code = 10
+        elseif parse(Int, id) % 1000 == 0
+            agglvl_code = 50
+        else
+            agglvl_code = 70
+        end
+        # Agglvl_code = base + agg_level + 2 (e.g. agg_level 2 -> 14 / 54 / 74)
+        agglvl_code += (agg_level+2)
+        # the twelve monthly job columns are averaged into one annual figure
+        query = "SELECT Naics as industry, (Jan_jobs+Feb_jobs+Mar_jobs+Apr_jobs+May_jobs+Jun_jobs+Jul_jobs+Aug_jobs+Sep_jobs+Oct_jobs+Nov_jobs+Dec_jobs)/12 as jobs FROM QCEW WHERE Year = "*string(year)*" AND GeoID = '"*id*"' AND Agglvl_code = "*string(agglvl_code)*";"
+        # execute
+        result = query_connection(conn, query)
+        # add geo_id col
+        result[!, :geo_id] = fill(id, nrow(result))
+        # add agg level
+        result[!, :agg_level] = fill(agg_level, nrow(result))
+        # copy results to be returned
+        dftemp = vcat(dftemp, result)
+    end 
+    dftemp[!, :region] = fill("US", nrow(dftemp))
+    return dftemp
+end

+function eu_lfs(conn::MySQL.Connection, selection::Dict)
+    # getting valuable info from dictionary
+    year = selection["year"]
+    geo_id = selection["geo_id"]
+    # object to be returned
+    dftemp = DataFrame()
+    # iterate over geo objects
    for id in geo_id
-        query = "SELECT sum(CNS01) as jobs_11, sum(CNS02) as jobs_21, sum(CNS03) as jobs_22, sum(CNS04) as jobs_23, sum(CNS05) as jobs_31-33, sum(CNS06) as jobs_42, sum(CNS07) as jobs_44-45, sum(CNS08) as jobs_48-49, sum(CNS09) as jobs_51, sum(CNS10) as jobs_52, sum(CNS11) as jobs_53, sum(CNS12) as jobs_54, sum(CNS13) as jobs_55, sum(CNS14) as jobs_56, sum(CNS15) as jobs_61, sum(CNS16) as jobs_62, sum(CNS17) as jobs_71, sum(CNS18) as jobs_72, sum(CNS19) as jobs_81, sum(CNS20) as jobs_92 FROM LODES8 WHERE Year = "*year*" AND GeoID = '"*id*"';"
+        # prepare query
+        query = "SELECT Nace as industry, ((EmpTh_Q1+EmpTh_Q2+EmpTh_Q3+EmpTh_Q4)/4)*1000 as jobs FROM LFS WHERE Year = "*string(year)*" AND GeoID = '"*id*"';"
+        # execute
+        result = query_connection(conn, query)
+        # add geo_id col
+        result[!, :geo_id] = fill(id, nrow(result))
+        # add agg level
+        result[!, :agg_level] = fill(2, nrow(result))
+        # copy results to be returned
+        dftemp = vcat(dftemp, result)
+    end 
+    dftemp[!, :region] = fill("EU", nrow(dftemp))
+    return dftemp
+end
+
+function eu_sbs(conn::MySQL.Connection, selection::Dict)
+        # getting valuable info from dictionary
+        year = selection["year"]
+        geo_id = selection["geo_id"]
+        # object to be returned
+        dftemp = DataFrame()
+        # iterate over geo objects
+        for id in geo_id
+            # prepare query
+            query = "SELECT Nace as industry, Employment as jobs FROM SBS WHERE Year = "*string(year)*" AND GeoID = '"*id*"';"
+            # execute
+            result = query_connection(conn, query)
+            # add geo_id col
+            result[!, :geo_id] = fill(id, nrow(result))
+            # add agg level
+            result[!, :agg_level] = fill(4, nrow(result))
+            # copy results to be returned
+            dftemp = vcat(dftemp, result)
+        end 
+        dftemp[!, :region] = fill("EU", nrow(dftemp))
+        return dftemp
+end

+function eu_rea(conn::MySQL.Connection, selection::Dict)
+    # getting valuable info from dictionary
+    year = selection["year"]
+    geo_id = selection["geo_id"]
+    # object to be returned
+    dftemp = DataFrame()
+    # iterate over geo objects
+    for id in geo_id
+        # prepare query
+        query = "SELECT Nace as industry, EmpTh*1000 as jobs FROM REA WHERE Year = "*string(year)*" AND GeoID = '"*id*"';"
+        # execute
        result = query_connection(conn, query)
-        result[!, :geo_id] = fill(id, nrow(result))    
+        # add geo_id col
+        result[!, :geo_id] = fill(id, nrow(result))
+        # add agg level
+        result[!, :agg_level] = fill(1, nrow(result))
+        # copy results to be returned
        dftemp = vcat(dftemp, result)
    end 
-    dftemp[!, :region] = fill("US", nrow(dftemp))
+    dftemp[!, :region] = fill("EU", nrow(dftemp))
    return dftemp
 end

+
 function get_annual_jobs(table::String, selection::Dict; host::String="127.0.0.1")
    table_dict = Dict(
        "us_lodes" => us_lodes,
@@ -54,11 +142,13 @@ function get_annual_jobs(table::String, selection::Dict; host::String="127.0.0.1
    df_template = DataFrame(
        region = String[], # column to store if EU or US
        geo_id = String[],  # Column for Geo IDs
+        agg_level = Int8[], # Level of industrial classification
+        industry = String[], # industry identifier
        jobs = Float64[] # column for average number of annual jobs
    )

    # data retrieval
-    df = get_selected_objects_from_table(table, df_template, table_dict, "FACT_jobs", selection; host=host) 
+    df = get_selected_objects_from_table(table, df_template, table_dict, "FACT_jobs", selection; host=host, port=3308) 

    return df
 end

--- a/src/utils_DB_query.jl
+++ b/src/utils_DB_query.jl
@@ -98,12 +98,12 @@ specific tables or actions. For each identified table, the function fetches data
 - It performs type checking and conversion to ensure that the data types of the columns in the result match those in the original `df`.
 - Additional data type conversions should be added as necessary for other column types.
 """
-function get_all_objects_from_table(scale::String, df::DataFrame, table_dict::Dict, database::String; host::String="127.0.0.1")
+function get_all_objects_from_table(scale::String, df::DataFrame, table_dict::Dict, database::String; host::String="127.0.0.1", port::Int=3306)
    # Interpret the scale using an external function
    interpreted_scale = interpret_scale(scale)
    
    # Establish the connection
-    conn = establish_connection(host, "root", "devops", database; port=3307)
+    conn = establish_connection(host, "root", "devops", database; port=port)
    
    for table in interpreted_scale
        result = handle_simple_action(String(table), table_dict, conn)
@@ -119,8 +119,6 @@ function get_all_objects_from_table(scale::String, df::DataFrame, table_dict::Di
                    result[!, col] = string.(result[:, col])
                elseif df_col_type == Float64
                    result[!, col] = [tryparse(Float64, x) for x in result[:, col]]
-                #else
-                    # Add more type conversions as needed
                end
            end
        end
@@ -144,12 +142,13 @@ function handle_selection_action(table::String, table_dict::Dict, conn::MySQL.Co
    return func(conn, selection)
 end

-function get_selected_objects_from_table(table::String, df::DataFrame, table_dict::Dict, database::String, selection::Dict; host::String="127.0.0.1")
+function get_selected_objects_from_table(table::String, df::DataFrame, table_dict::Dict, database::String, selection::Dict; host::String="127.0.0.1", port::Int=3306)
    
    # Establish the connection
-    conn = establish_connection(host, "root", "devops", database; port=3307)
-
+    conn = establish_connection(host, "root", "devops", database; port=port)
+    # get results
    result = handle_selection_action(String(table), table_dict, conn, selection)
+    # start converting data types to fix mistakes in SQL types interpretation
    result = result[:, names(df)]
    # Check and convert each column in result to match the type in df
    for col in names(df)
@@ -162,8 +161,6 @@ function get_selected_objects_from_table(table::String, df::DataFrame, table_dic
                result[!, col] = string.(result[:, col])
            elseif df_col_type == Float64
                result[!, col] = [tryparse(Float64, x) for x in result[:, col]]
-            #else
-                # Add more type conversions as needed
            end
        end
    end

--- a/test/nm_test/FACT_geo.jl
+++ b/test/nm_test/FACT_geo.jl
+@testset "FACTDataREader FACT_geo test" begin
+    df = get_geo_objects("nm_counties")
+    # Test if the result is a DataFrame
+    @test df isa DataFrame
+    # Test for expected columns and right order
+    @test names(df) == ["region", "geo_id", "shape_obj", "area"]
+    # Test for expected number of rows/columns
+    @test size(df, 1) == 33
+    @test size(df, 2) == 4
+end
--- a/test/nm_test.jl
+++ b/test/nm_test.jl
-using DataFrames
-
-@testset "FACTDataREader FACT_geo test" begin
-    df = get_geo_objects("nm_counties")
-    # Test if the result is a DataFrame
-    @test df isa DataFrame
-    # Test for expected columns
-    @test all(ismember.([:region, :geo_id, :shape_obj, :area], names(df)))
-    # Test for expected number of rows/columns
-    @test size(df, 1) == 33
-    @test size(df, 2) == 4
-end
-
 @testset "FACTDataREader FACT_jobs test" begin
    df1 = get_geo_objects("nm_counties")
+    resize!(df1, 3)
    year = 2018
+    agg_lvl = 2
    selection_dict = Dict(
        "year" => year,
-        "geo_id" => df1[!, :geo_id]
+        "geo_id" => df1[!, :geo_id],
+        "agg_level" => agg_lvl
    )
-    df_lodes = get_annual_jobs("LODES8", selection_dict)
+    # TEST of LODES
+    df_lodes = get_annual_jobs("us_lodes", selection_dict)
    # Test if the result is a DataFrame
    @test df_lodes isa DataFrame
-    # Test for expected columns
-    #@test all(ismember.([:region, :geo_id, :shape_obj, :area], names(df_lodes)))
    # Test for expected number of rows/columns
-    @test size(df_lodes, 1) == 33
-    @test size(df_lodes, 2) == 21
-end
\ No newline at end of file
+    @test size(df_lodes, 1) == 60
+    @test size(df_lodes, 2) == 5
+    # TEST of QCEW
+    df_qcew = get_annual_jobs("us_qcew", selection_dict)
+    # Test if the result is a DataFrame
+    @test df_qcew isa DataFrame
+    # Test for expected number of rows/columns
+    @test size(df_qcew, 1) == 57
+    @test size(df_qcew, 2) == 5
+    
+end
--- a/test/no_test/FACT_geo.jl
+++ b/test/no_test/FACT_geo.jl
+@testset "FACTDataREader FACT_geo test" begin
+    df = get_geo_objects("no_counties")
+    # Test if the result is a DataFrame
+    @test df isa DataFrame
+    # Test for expected columns and right order
+    @test names(df) == ["region", "geo_id", "shape_obj", "area"]
+    # Test for expected number of rows/columns
+    @test size(df, 1) == 13
+    @test size(df, 2) == 4
+end
--- a/test/no_test/FACT_jobs.jl
+++ b/test/no_test/FACT_jobs.jl
+@testset "FACTDataREader FACT_jobs test" begin
+    # Norway checks for the EU job tables (REA, LFS, SBS). Results come from
+    # get_geo_objects / get_annual_jobs, so the backing databases must be reachable.
+    df1 = get_geo_objects("no_counties")
+    # keep only the first three geographies to limit the number of queries
+    resize!(df1, 3)
+    year = 2018
+    agg_lvl = 2
+    selection_dict = Dict(
+        "year" => year,
+        "geo_id" => df1[!, :geo_id],
+        "agg_level" => agg_lvl
+    )
+    # TEST of REA
+    df_rea = get_annual_jobs("eu_rea", selection_dict)
+    # Test if the result is a DataFrame
+    @test df_rea isa DataFrame
+    # Test for expected number of rows/columns
+    @test size(df_rea, 1) == 39
+    @test size(df_rea, 2) == 5
+    # TEST of LFS
+    selection_dict_NO = Dict(
+        "year" => year,
+        "geo_id" => ["NO"]
+    )
+    df_lfs = get_annual_jobs("eu_lfs", selection_dict_NO)
+    # Test if the result is a DataFrame
+    @test df_lfs isa DataFrame
+    # Test for expected number of rows/columns
+    @test size(df_lfs, 1) == 85
+    @test size(df_lfs, 2) == 5
+    # TEST of SBS
+    selection_dict_NO_21 = Dict(
+        "year" => 2021,
+        "geo_id" => ["NO"]
+    )
+    df_sbs = get_annual_jobs("eu_sbs", selection_dict_NO_21)
+    # Test if the result is a DataFrame
+    # (assert on df_sbs: re-checking df_lfs here left the SBS result untested)
+    @test df_sbs isa DataFrame
+    # Test for expected number of rows/columns
+    # TODO: pin the exact SBS row count once confirmed against the SBS table
+    @test size(df_sbs, 1) > 0
+    @test size(df_sbs, 2) == 5
+end
--- a/test/runtests.jl
+++ b/test/runtests.jl
 using Test
+using DataFrames
 using FACTDataReader

-@testset "FACTDataREader Tests" begin
-    include("nm_test.jl")
+@testset "FACTDataREader New Mexico Tests" begin
+    #include("nm_test/FACT_geo.jl")
+    #include("nm_test/FACT_jobs.jl")
 end
+
+@testset "FACTDataREader Norway Tests" begin
+    #include("no_test/FACT_geo.jl")
+    #include("no_test/FACT_jobs.jl")
+end
\ No newline at end of file