In [32]:
from google.colab import drive
drive.mount('/content/drive/' ,force_remount=True)
Mounted at /content/drive/

Transactional Table Data¶

In [33]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Retail_Data_Transactions.csv')
df
Out[33]:
customer_id trans_date tran_amount
0 CS5295 11-Feb-13 35
1 CS4768 15-Mar-15 39
2 CS2122 26-Feb-13 52
3 CS1217 16-Nov-11 99
4 CS1850 20-Nov-13 78
... ... ... ...
124995 CS8433 26-Jun-11 64
124996 CS7232 19-Aug-14 38
124997 CS8731 28-Nov-14 42
124998 CS8133 14-Dec-13 13
124999 CS7996 13-Dec-14 36

125000 rows × 3 columns

Master Table Data¶

In [34]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Child_Smokers.csv')
df
Out[34]:
Age (years) Height (cm) FEV (litres) Sex Smoker
0 9 145 1.708 female non
1 8 171 1.724 female non
2 7 138 1.720 female non
3 9 135 1.558 male non
4 9 145 1.895 male non
... ... ... ... ... ...
649 15 152 2.278 female smoker
650 16 183 4.872 male smoker
651 16 170 4.270 male smoker
652 15 173 3.727 male smoker
653 16 160 2.795 female smoker

654 rows × 5 columns

In [35]:
abc = df[['Age (years)', 'Height (cm)']]
abc
Out[35]:
Age (years) Height (cm)
0 9 145
1 8 171
2 7 138
3 9 135
4 9 145
... ... ...
649 15 152
650 16 183
651 16 170
652 15 173
653 16 160

654 rows × 2 columns

Time-Series Data¶

In [36]:
df = pd.read_csv('/content/drive/MyDrive/Electric_Production.csv')
df
Out[36]:
DATE IPG2211A2N
0 1/1/1985 72.5052
1 2/1/1985 70.6720
2 3/1/1985 62.4502
3 4/1/1985 57.4714
4 5/1/1985 55.3151
... ... ...
392 9/1/2017 98.6154
393 10/1/2017 93.6137
394 11/1/2017 97.3359
395 12/1/2017 114.7212
396 1/1/2018 129.4048

397 rows × 2 columns

In [37]:
from matplotlib import pyplot as plt
import seaborn as sns
date_column = 'DATE'
value_column = 'IPG2211A2N'

fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')

sns.lineplot(x=date_column, y=value_column, data=df, palette='Dark2')

plt.xlabel(date_column)
plt.ylabel(value_column)
plt.title('Time Series Plot of ' + value_column)

sns.despine(fig=fig, ax=ax)
plt.show()
<ipython-input-37-664341d67888>:8: UserWarning: Ignoring `palette` because no `hue` variable has been assigned.
  sns.lineplot(x=date_column, y=value_column, data=df, palette='Dark2')
In [38]:
print(df[['DATE']])
type(df[['DATE']])
          DATE
0     1/1/1985
1     2/1/1985
2     3/1/1985
3     4/1/1985
4     5/1/1985
..         ...
392   9/1/2017
393  10/1/2017
394  11/1/2017
395  12/1/2017
396   1/1/2018

[397 rows x 1 columns]
Out[38]:
pandas.core.frame.DataFrame

Graph/Network Data¶

In [39]:
df = pd.read_csv('/content/drive/MyDrive/InputFileEdges.csv')
df
Out[39]:
from to weight type
0 s01 s02 10 hyperlink
1 s01 s02 12 hyperlink
2 s01 s03 22 hyperlink
3 s01 s04 21 hyperlink
4 s04 s11 22 mention
5 s05 s15 21 mention
6 s06 s17 21 mention
7 s08 s09 11 mention
8 s08 s09 12 mention
9 s03 s04 22 hyperlink
10 s04 s03 23 hyperlink
11 s01 s15 20 mention
12 s15 s01 11 hyperlink
13 s15 s01 11 hyperlink
14 s16 s17 21 mention
15 s16 s06 23 hyperlink
16 s06 s16 21 hyperlink
17 s09 s10 21 mention
18 s08 s07 21 mention
19 s07 s08 22 mention
20 s07 s10 21 hyperlink
21 s05 s02 21 hyperlink
22 s02 s03 21 hyperlink
23 s02 s01 23 hyperlink
24 s03 s01 21 hyperlink
25 s12 s13 22 hyperlink
26 s12 s14 22 mention
27 s14 s13 21 mention
28 s13 s12 21 hyperlink
29 s05 s09 2 hyperlink
30 s02 s10 5 hyperlink
31 s03 s12 1 hyperlink
32 s04 s06 1 mention
33 s10 s03 2 hyperlink
34 s03 s10 2 mention
35 s04 s12 3 hyperlink
36 s13 s17 1 mention
37 s14 s11 1 mention
38 s03 s11 1 hyperlink
39 s12 s06 2 mention
40 s04 s17 2 mention
41 s17 s04 4 hyperlink
42 s08 s03 2 hyperlink
43 s03 s08 4 hyperlink
44 s07 s14 4 mention
45 s15 s06 4 hyperlink
46 s15 s04 1 hyperlink
47 s05 s01 1 mention
48 s02 s09 1 hyperlink
49 s03 s05 1 hyperlink
50 s07 s03 1 mention
In [40]:
r_hyper = df.loc[df['type'] == 'hyperlink']
r_hyper
Out[40]:
from to weight type
0 s01 s02 10 hyperlink
1 s01 s02 12 hyperlink
2 s01 s03 22 hyperlink
3 s01 s04 21 hyperlink
9 s03 s04 22 hyperlink
10 s04 s03 23 hyperlink
12 s15 s01 11 hyperlink
13 s15 s01 11 hyperlink
15 s16 s06 23 hyperlink
16 s06 s16 21 hyperlink
20 s07 s10 21 hyperlink
21 s05 s02 21 hyperlink
22 s02 s03 21 hyperlink
23 s02 s01 23 hyperlink
24 s03 s01 21 hyperlink
25 s12 s13 22 hyperlink
28 s13 s12 21 hyperlink
29 s05 s09 2 hyperlink
30 s02 s10 5 hyperlink
31 s03 s12 1 hyperlink
33 s10 s03 2 hyperlink
35 s04 s12 3 hyperlink
38 s03 s11 1 hyperlink
41 s17 s04 4 hyperlink
42 s08 s03 2 hyperlink
43 s03 s08 4 hyperlink
45 s15 s06 4 hyperlink
46 s15 s04 1 hyperlink
48 s02 s09 1 hyperlink
49 s03 s05 1 hyperlink

Cross Table Data¶

In [41]:
from google.colab import drive
drive.mount('/content/drive/' ,force_remount=True)
Mounted at /content/drive/
In [42]:
df = pd.read_csv('/content/drive/MyDrive/FUBESBFWFW.csv')
df
Out[42]:
Region Urban Population (%) in 2020 Urban Population (%) in 2100 (projected) Change (2020 - 2100)
0 World 56.50% 68.50% +12.0%
1 Eastern Africa 43.10% 74.30% +31.2%
2 Central and Southern Asia 34.50% 64.50% +30.0%
3 Eastern Asia 69.10% 80.30% +11.2%
4 Northern Africa and Western Asia 74.10% 82.50% +8.4%
5 Latin America and the Caribbean 81.50% 88.20% +6.7%
6 Northern America 82.50% 85.40% +2.9%
7 Oceania 73.90% 82.20% +8.3%
8 Europe 74.40% 80.20% +5.8%

Semi-Structured¶

In [43]:
import requests
import pandas as pd
In [44]:
url = "https://min-api.cryptocompare.com/data/histoday?fsym=BTC&tsym=ETH&limit=30&aggregate=1&e=CCCAGG" ###nested table
resp = requests.get(url)
In [45]:
data = resp.json()['Data']
In [46]:
TabWOData = resp.json()
del TabWOData['Data']
TabWOData
Out[46]:
{'Response': 'Success',
 'Type': 100,
 'Aggregated': False,
 'TimeTo': 1707004800,
 'TimeFrom': 1704412800,
 'FirstValueInArray': True,
 'ConversionType': {'type': 'invert', 'conversionSymbol': ''},
 'RateLimit': {},
 'HasWarning': False}
In [47]:
df = pd.DataFrame(data)
df
Out[47]:
time high low open volumefrom volumeto close conversionType conversionSymbol
0 1704412800 19.62 19.28 19.47 6378.10 124246.51 19.46 invert
1 1704499200 19.63 19.45 19.46 2566.18 50144.04 19.62 invert
2 1704585600 19.82 19.54 19.62 3066.93 60420.49 19.77 invert
3 1704672000 20.18 19.71 19.77 9824.44 195508.16 20.14 invert
4 1704758400 20.86 19.27 20.14 13542.65 274160.47 19.67 invert
5 1704844800 19.67 17.98 19.67 27096.81 508199.96 18.06 invert
6 1704931200 18.40 17.65 18.06 18234.48 326585.22 17.70 invert
7 1705017600 17.81 16.35 17.70 21526.49 367206.38 16.97 invert
8 1705104000 17.04 16.57 16.97 9175.30 154040.63 16.62 invert
9 1705190400 17.09 16.61 16.62 5721.08 96473.67 16.88 invert
10 1705276800 17.02 16.75 16.88 5745.29 97049.69 16.93 invert
11 1705363200 17.02 16.62 16.93 6248.24 105173.29 16.67 invert
12 1705449600 16.93 16.63 16.67 5385.49 90189.59 16.90 invert
13 1705536000 16.95 16.66 16.90 6031.27 101540.35 16.73 invert
14 1705622400 16.87 16.52 16.73 6849.00 114288.86 16.72 invert
15 1705708800 16.93 16.71 16.72 2266.45 38136.45 16.87 invert
16 1705795200 16.94 16.82 16.87 1548.69 26125.34 16.93 invert
17 1705881600 17.23 16.87 16.93 7899.95 134856.31 17.10 invert
18 1705968000 17.94 17.04 17.10 8914.91 156284.65 17.79 invert
19 1706054400 18.02 17.79 17.79 4401.59 78841.48 17.94 invert
20 1706140800 18.22 17.87 17.94 4744.42 85597.45 18.01 invert
21 1706227200 18.67 17.93 18.01 6196.83 113551.18 18.44 invert
22 1706313600 18.62 18.31 18.44 2069.10 38156.40 18.57 invert
23 1706400000 18.67 18.45 18.57 2623.36 48690.03 18.62 invert
24 1706486400 18.85 18.57 18.62 3818.25 71342.40 18.68 invert
25 1706572800 18.84 18.22 18.68 5774.91 107022.72 18.33 invert
26 1706659200 18.69 18.32 18.33 4589.09 85047.65 18.65 invert
27 1706745600 18.76 18.47 18.65 3777.89 70354.20 18.70 invert
28 1706832000 18.76 18.58 18.70 2941.62 54930.31 18.71 invert
29 1706918400 18.75 18.57 18.71 1632.05 30447.72 18.72 invert
30 1707004800 18.76 18.60 18.72 1429.95 26702.86 18.74 invert
In [48]:
df2 = pd.DataFrame(TabWOData)
df2
Out[48]:
Response Type Aggregated TimeTo TimeFrom FirstValueInArray ConversionType RateLimit HasWarning
type Success 100 False 1707004800 1704412800 True invert NaN False
conversionSymbol Success 100 False 1707004800 1704412800 True NaN False

Unstructued¶

In [49]:
!pip install scikit-image
Requirement already satisfied: scikit-image in /usr/local/lib/python3.10/dist-packages (0.19.3)
Requirement already satisfied: numpy>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from scikit-image) (1.23.5)
Requirement already satisfied: scipy>=1.4.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image) (1.11.4)
Requirement already satisfied: networkx>=2.2 in /usr/local/lib/python3.10/dist-packages (from scikit-image) (3.2.1)
Requirement already satisfied: pillow!=7.1.0,!=7.1.1,!=8.3.0,>=6.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-image) (9.4.0)
Requirement already satisfied: imageio>=2.4.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image) (2.31.6)
Requirement already satisfied: tifffile>=2019.7.26 in /usr/local/lib/python3.10/dist-packages (from scikit-image) (2024.1.30)
Requirement already satisfied: PyWavelets>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image) (1.5.0)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from scikit-image) (23.2)
In [50]:
!wget https://upload.wikimedia.org/wikipedia/commons/6/60/Wget_1.13.4.png
--2024-02-04 19:26:58--  https://upload.wikimedia.org/wikipedia/commons/6/60/Wget_1.13.4.png
Resolving upload.wikimedia.org (upload.wikimedia.org)... 208.80.154.240, 2620:0:861:ed1a::2:b
Connecting to upload.wikimedia.org (upload.wikimedia.org)|208.80.154.240|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 96313 (94K) [image/png]
Saving to: ‘Wget_1.13.4.png.1’

Wget_1.13.4.png.1   100%[===================>]  94.06K  --.-KB/s    in 0.03s   

2024-02-04 19:26:58 (3.34 MB/s) - ‘Wget_1.13.4.png.1’ saved [96313/96313]

In [51]:
import matplotlib.pyplot as plt
img = imread('Wget_1.13.4.png')
img.shape
Out[51]:
(372, 481, 4)
In [52]:
plt.imshow(img)
Out[52]:
<matplotlib.image.AxesImage at 0x7da042acb340>