import folium
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
Load Data
Load Data
# Load the 2015 street tree census extract.
# `created_at` is parsed as datetimes and `tree_id` becomes the row index.
data = pd.read_csv(
    "../data/tree_data/2015-street-tree-census-tree-data.csv",
    parse_dates=["created_at"],
    index_col="tree_id",
)
# Preview the first rows to sanity-check the load.
data.head()
Unnamed: 0 | block_id | created_at | tree_dbh | stump_diam | curb_loc | status | health | spc_latin | spc_common | ... | boro_ct | state | latitude | longitude | x_sp | y_sp | council district | census tract | bin | bbl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
tree_id | |||||||||||||||||||||
221864 | 19575 | 107688 | 2015-09-13 | 11 | 0 | OnCurb | Alive | Poor | Pyrus calleryana | Callery pear | ... | 1013600 | New York | 40.773402 | -73.947079 | 9.989077e+05 | 221052.5156 | 5.0 | 136.0 | 1051194.0 | 1.015800e+09 |
328163 | 172248 | 213992 | 2015-10-14 | 9 | 0 | OnCurb | Alive | Good | Quercus bicolor | swamp white oak | ... | 3072600 | New York | 40.631616 | -73.933963 | 1.002579e+06 | 169398.2032 | 45.0 | 726.0 | 3214207.0 | 3.077480e+09 |
690511 | 483212 | 214423 | 2016-08-31 | 25 | 0 | OnCurb | Alive | Good | Platanus x acerifolia | London planetree | ... | 3095600 | New York | 40.635410 | -73.909161 | 1.009462e+06 | 170786.3448 | 46.0 | 956.0 | 3225006.0 | 3.080200e+09 |
290017 | 103200 | 349008 | 2015-10-06 | 7 | 0 | OnCurb | Alive | Good | Quercus palustris | pin oak | ... | 4071303 | New York | 40.728232 | -73.849770 | 1.025888e+06 | 204626.9553 | 29.0 | 71303.0 | 4051256.0 | 4.021340e+09 |
40867 | 557989 | 108216 | 2015-06-29 | 10 | 0 | OnCurb | Alive | Good | Styphnolobium japonicum | Sophora | ... | 1016001 | New York | 40.785596 | -73.953476 | 9.971335e+05 | 225494.2784 | 4.0 | 16001.0 | 1047404.0 | 1.015060e+09 |
5 rows × 45 columns
Data info
# Print the per-column non-null counts, dtypes and overall memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 221864 to 612369
Data columns (total 45 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 10000 non-null int64
1 block_id 10000 non-null int64
2 created_at 10000 non-null datetime64[ns]
3 tree_dbh 10000 non-null int64
4 stump_diam 10000 non-null int64
5 curb_loc 10000 non-null object
6 status 10000 non-null object
7 health 9545 non-null object
8 spc_latin 9545 non-null object
9 spc_common 9545 non-null object
10 steward 2474 non-null object
11 guards 1209 non-null object
12 sidewalk 9545 non-null object
13 user_type 10000 non-null object
14 problems 3305 non-null object
15 root_stone 10000 non-null object
16 root_grate 10000 non-null object
17 root_other 10000 non-null object
18 trunk_wire 10000 non-null object
19 trnk_light 10000 non-null object
20 trnk_other 10000 non-null object
21 brch_light 10000 non-null object
22 brch_shoe 10000 non-null object
23 brch_other 10000 non-null object
24 address 10000 non-null object
25 postcode 10000 non-null int64
26 zip_city 10000 non-null object
27 community board 10000 non-null int64
28 borocode 10000 non-null int64
29 borough 10000 non-null object
30 cncldist 10000 non-null int64
31 st_assem 10000 non-null int64
32 st_senate 10000 non-null int64
33 nta 10000 non-null object
34 nta_name 10000 non-null object
35 boro_ct 10000 non-null int64
36 state 10000 non-null object
37 latitude 10000 non-null float64
38 longitude 10000 non-null float64
39 x_sp 10000 non-null float64
40 y_sp 10000 non-null float64
41 council district 9902 non-null float64
42 census tract 9902 non-null float64
43 bin 9871 non-null float64
44 bbl 9871 non-null float64
dtypes: datetime64[ns](1), float64(8), int64(11), object(25)
memory usage: 3.5+ MB
Missing values pie chart
# Share of missing values per column, expressed in percent.
# NOTE: the column name "percantage" (sic) is kept as-is because the pie-chart
# cell further down reads the column by exactly this name.
missing_values_df = pd.DataFrame(data.isnull().mean() * 100, columns=["percantage"])
# Keep only the columns that actually contain missing values.
missing_values_df = missing_values_df[missing_values_df["percantage"] > 0]
missing_values_df
percantage | |
---|---|
health | 4.55 |
spc_latin | 4.55 |
spc_common | 4.55 |
steward | 75.26 |
guards | 87.91 |
sidewalk | 4.55 |
problems | 66.95 |
council district | 0.98 |
census tract | 0.98 |
bin | 1.29 |
bbl | 1.29 |
# Pie chart of the missing-value percentages, one slice per affected column
# (slice labels come from the DataFrame index, i.e. the column names).
fig = px.pie(missing_values_df, values="percantage", names=missing_values_df.index)
# Centre the title horizontally (title_x=0.5).
fig.update_layout(title="Процент пропущенных значений в данных", title_x=0.5)
fig.show()
Features pairplot
# Show the first rows again as a reminder of the available columns.
data.head()
Unnamed: 0 | block_id | created_at | tree_dbh | stump_diam | curb_loc | status | health | spc_latin | spc_common | ... | boro_ct | state | latitude | longitude | x_sp | y_sp | council district | census tract | bin | bbl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
tree_id | |||||||||||||||||||||
221864 | 19575 | 107688 | 2015-09-13 | 11 | 0 | OnCurb | Alive | Poor | Pyrus calleryana | Callery pear | ... | 1013600 | New York | 40.773402 | -73.947079 | 9.989077e+05 | 221052.5156 | 5.0 | 136.0 | 1051194.0 | 1.015800e+09 |
328163 | 172248 | 213992 | 2015-10-14 | 9 | 0 | OnCurb | Alive | Good | Quercus bicolor | swamp white oak | ... | 3072600 | New York | 40.631616 | -73.933963 | 1.002579e+06 | 169398.2032 | 45.0 | 726.0 | 3214207.0 | 3.077480e+09 |
690511 | 483212 | 214423 | 2016-08-31 | 25 | 0 | OnCurb | Alive | Good | Platanus x acerifolia | London planetree | ... | 3095600 | New York | 40.635410 | -73.909161 | 1.009462e+06 | 170786.3448 | 46.0 | 956.0 | 3225006.0 | 3.080200e+09 |
290017 | 103200 | 349008 | 2015-10-06 | 7 | 0 | OnCurb | Alive | Good | Quercus palustris | pin oak | ... | 4071303 | New York | 40.728232 | -73.849770 | 1.025888e+06 | 204626.9553 | 29.0 | 71303.0 | 4051256.0 | 4.021340e+09 |
40867 | 557989 | 108216 | 2015-06-29 | 10 | 0 | OnCurb | Alive | Good | Styphnolobium japonicum | Sophora | ... | 1016001 | New York | 40.785596 | -73.953476 | 9.971335e+05 | 225494.2784 | 4.0 | 16001.0 | 1047404.0 | 1.015060e+09 |
5 rows × 45 columns
# Draw 50,000 rows with replacement; the fixed seed makes the draw reproducible.
# NOTE(review): `data.info()` above reports 10,000 rows, so this sample
# necessarily contains repeated rows — confirm that oversampling is intended.
data_sample = data.sample(n=50000, replace=True, random_state=42)
# Every column whose dtype is not `object` (this also keeps datetime columns).
num_cols = data_sample.select_dtypes(exclude="object").columns.to_list()
num_cols
['Unnamed: 0',
'block_id',
'created_at',
'tree_dbh',
'stump_diam',
'postcode',
'community board',
'borocode',
'cncldist',
'st_assem',
'st_senate',
'boro_ct',
'latitude',
'longitude',
'x_sp',
'y_sp',
'council district',
'census tract',
'bin',
'bbl']
# Remove rows that are exact duplicates of an earlier row (the index is not
# part of the comparison), keeping the first occurrence of each.
data_sample = data_sample.drop_duplicates()
# Sanity check: the number of remaining duplicated rows should be 0.
data_sample.duplicated().sum()
0
# Plot pairwise relationships between all columns listed in `num_cols`.
sns.pairplot(data_sample[num_cols])
Correlation data
# Large canvas so the annotated cells of the correlation matrix stay legible.
plt.figure(figsize=(20, 15))
# Spearman rank correlation between the columns in `num_cols`, drawn as an
# annotated heatmap with two-decimal labels and thin separators between cells.
sns.heatmap(
    data_sample[num_cols].corr(method="spearman"),
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
)
Tree mapping
import random

# Make an empty map
m = folium.Map(location=[20, 0], tiles="OpenStreetMap", zoom_start=2)

# NOTE(review): `random_start` is not used by the loop below; it is kept only
# so that any later cell reading it keeps working. Confirm whether the markers
# were meant to start at a random offset.
random_start = random.randint(0, 10000)

# Add one marker per tree for (at most) the first 1000 sampled rows.
# Slicing bounds the loop by the rows actually available, so a short sample
# no longer raises IndexError.
marker_rows = data_sample.iloc[:1000]
for lat, lon, species in zip(
    marker_rows["latitude"], marker_rows["longitude"], marker_rows["spc_latin"]
):
    # The popup is taken from the same row as the coordinates. Previously it
    # was read from `data.iloc[i]`, i.e. from an unrelated tree.
    folium.Marker(
        location=[lat, lon],
        popup=species,
    ).add_to(m)

# Display the map as the cell output.
m
Make this Notebook Trusted to load map: File -> Trust Notebook