"""
creates data sets and prepares them for analysis.
this is done to create consistency across all notebooks.
"""
print('creating dataset for 4d-modeller...')
import pandas as pd
import geopandas as gpd
import numpy as np
import xarray as xr
# import hydrobasins data
shapefile_path = 'data/hydrobasins_lvl6/hybas_as_lev06_v1c.shp'
shapes = gpd.read_file(shapefile_path)
# import precipitation data
def get_precip_data_for_year(data_path, shapes):
    dataset = xr.open_dataset(data_path)
    sums = []
    for index, shape in shapes.iterrows():
        # Extract the geometry
        geom = shape.geometry
        # Select the data from the NetCDF file at the shape's centroid.
        # Because the 'nearest' method is used, a value is computed even
        # where there is no data: an average is taken from the data
        # nearest to the point, which makes it appear as though there is
        # data for areas where there is none (e.g., in Japan).
        # TODO: label whether a shape is part of the Tibetan Plateau or not
        data = dataset.sel(longitude=geom.centroid.x, latitude=geom.centroid.y, method='nearest')
        # Sum the precipitation ('tp') over the year and append it to the list
        data_sum = data.sum()['tp'].values
        sums.append(data_sum)
    return sums
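
# A possible tightening of the lookup above (a sketch, not part of the original
# pipeline): xarray's `sel` accepts a `tolerance` alongside method='nearest',
# which raises a KeyError when the centroid is farther from any grid point than
# the tolerance, instead of silently filling areas with no nearby data. The
# 0.25-degree default below is an assumed grid spacing, not read from the data.
def get_precip_data_for_year_strict(data_path, shapes, tol=0.25):
    dataset = xr.open_dataset(data_path)
    sums = []
    for index, shape in shapes.iterrows():
        geom = shape.geometry
        try:
            data = dataset.sel(longitude=geom.centroid.x,
                               latitude=geom.centroid.y,
                               method='nearest', tolerance=tol)
            sums.append(data.sum()['tp'].values)
        except KeyError:
            # no grid point within `tol` degrees of this centroid
            sums.append(np.nan)
    return sums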
years = np.arange(1984, 2022, 1)
final_data_set = pd.DataFrame()
for year in years:
    precip = get_precip_data_for_year(f'data/precip/{year}/data.nc', shapes=shapes)
    year_df = pd.DataFrame()
    year_df['precip'] = precip
    year_df['precip'] = year_df['precip'].astype(np.float64)
    year_df['year'] = year
    final_data_set = pd.concat([final_data_set, year_df])
# combine the lake catchment hydrobasins data with the other datasets
# Calculate the centroid of each shape (used later as map markers)
shapes['centroid_lon'] = shapes.apply(lambda row: row['geometry'].centroid.x, axis=1)
shapes['centroid_lat'] = shapes.apply(lambda row: row['geometry'].centroid.y, axis=1)
final_data_set = final_data_set.join(shapes).sort_values(by=['year', 'SORT'])
final_data_set = gpd.GeoDataFrame(final_data_set)
# bounding box taken from the precipitation data
xmin = 45
xmax = 107
ymin = 24
ymax = 68
final_data_set = final_data_set.cx[xmin:xmax, ymin:ymax]
# import lake changes from Google Earth Engine
# NOTE: this is a hard-coded local path and must be adjusted per machine
lake_df = pd.read_csv('/Users/johnaiken/repos/tblakes/data/lake_data.csv')
df = final_data_set.merge(lake_df, on=['HYBAS_ID', 'year'])
df = gpd.GeoDataFrame(df)
print('dataset is found in df')
if __name__ == "__main__":
    df.to_csv('data_for_4dm.csv')
    print('data set has been created and written to file. Have a nice day!')
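
# Example downstream usage (an assumption about how the notebooks consume the
# output, not taken from the original): after running `python setup.py`, the
# prepared dataset can be read back with
#     df = pd.read_csv('data_for_4dm.csv')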