20 May 2021 / PROJECTS

Stock Market Cluster Analysis with NetworkX

1. Data Load and Preprocessing
2. Data Visualization
- 2.1. Stock Price Volatility
- 2.2. Rolling Average of Stock Price Correlation
3. Network Analysis

1. Data Load and Preprocessing

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from sklearn import cluster, covariance, manifold
from community import community_louvain as louvain
import matplotlib.cm as cm
import networkx as nx
import networkx.algorithms.community as nxcom
from importlib import reload
import csv
import os
import re
%matplotlib inline

# Per-symbol daily price rows (open/high/low/close/volume/adjusted); the first
# CSV column is used as the row index.
df_prices = pd.read_csv("SP500_prices.csv", index_col = 0)
# Index/indicator series in long form; read without index_col, so the CSV's
# leading column is kept as an ordinary column.
df_indices = pd.read_csv("indices.csv")
# Per-symbol metadata (company, sector, weight, ...); the first CSV column is
# used as the row index.
df_SP500 = pd.read_csv("SP500.csv",index_col = 0)

df_prices.describe()

  
      
      open
      high
      low
      close
      volume
      adjusted
    

  
      count
      282821.000000
      282821.000000
      282821.000000
      282821.000000
      2.828210e+05
      282821.000000
    

      mean
      139.384227
      141.184891
      137.569435
      139.425703
      4.983235e+06
      137.510842
    

      std
      249.635882
      253.080583
      246.310869
      249.739199
      1.180107e+07
      249.704273
    

      min
      3.220000
      3.290000
      3.020000
      3.120000
      0.000000e+00
      3.092542
    

      25%
      47.780000
      48.419998
      47.120000
      47.790001
      1.010000e+06
      46.309925
    

      50%
      86.430000
      87.480003
      85.320000
      86.420000
      2.034800e+06
      84.330002
    

      75%
      151.500000
      153.380005
      149.600006
      151.570000
      4.528900e+06
      148.930000
    

      max
      4742.610000
      4832.800000
      4700.000000
      4776.410000
      4.286171e+08
      4776.410000
    

	open	high	low	close	volume	adjusted
count	282821.000000	282821.000000	282821.000000	282821.000000	2.828210e+05	282821.000000
mean	139.384227	141.184891	137.569435	139.425703	4.983235e+06	137.510842
std	249.635882	253.080583	246.310869	249.739199	1.180107e+07	249.704273
min	3.220000	3.290000	3.020000	3.120000	0.000000e+00	3.092542
25%	47.780000	48.419998	47.120000	47.790001	1.010000e+06	46.309925
50%	86.430000	87.480003	85.320000	86.420000	2.034800e+06	84.330002
75%	151.500000	153.380005	149.600006	151.570000	4.528900e+06	148.930000
max	4742.610000	4832.800000	4700.000000	4776.410000	4.286171e+08	4776.410000

df_prices.head()

  
      
      symbol
      date
      open
      high
      low
      close
      volume
      adjusted
    

  
      1
      AAPL
      2019-01-02
      38.722500
      39.712502
      38.557499
      39.480000
      148158800.0
      38.505024
    

      2
      AAPL
      2019-01-03
      35.994999
      36.430000
      35.500000
      35.547501
      365248800.0
      34.669640
    

      3
      AAPL
      2019-01-04
      36.132500
      37.137501
      35.950001
      37.064999
      234428400.0
      36.149662
    

      4
      AAPL
      2019-01-07
      37.174999
      37.207500
      36.474998
      36.982498
      219111200.0
      36.069202
    

      5
      AAPL
      2019-01-08
      37.389999
      37.955002
      37.130001
      37.687500
      164101200.0
      36.756794
    

	symbol	date	open	high	low	close	volume	adjusted
1	AAPL	2019-01-02	38.722500	39.712502	38.557499	39.480000	148158800.0	38.505024
2	AAPL	2019-01-03	35.994999	36.430000	35.500000	35.547501	365248800.0	34.669640
3	AAPL	2019-01-04	36.132500	37.137501	35.950001	37.064999	234428400.0	36.149662
4	AAPL	2019-01-07	37.174999	37.207500	36.474998	36.982498	219111200.0	36.069202
5	AAPL	2019-01-08	37.389999	37.955002	37.130001	37.687500	164101200.0	36.756794

# Keep only the adjusted price per (symbol, date) row and expose it under the
# generic column name 'price'.
df_prices_adj = df_prices[['symbol', 'date', 'adjusted']].rename(
    columns={'adjusted': 'price'}
)
df_prices_adj.tail()

	symbol	date	price
282817	NWS	2021-03-24	23.69
282818	NWS	2021-03-25	24.42
282819	NWS	2021-03-26	24.01
282820	NWS	2021-03-29	23.39
282821	NWS	2021-03-30	23.83

df_indices.describe()

  
      Unnamed: 0
      price
    
      count
      16385.000000
      16215.000000
    
      mean
      8193.000000
      588.995521
    
      std
      4730.086416
      3309.895206
    
      min
      1.000000
      -36.980000
    
      25%
      4097.000000
      1.284350
    
      50%
      8193.000000
      4.227200
    
      75%
      12289.000000
      107.426300
    
      max
      16385.000000
      59221.230000

	Unnamed: 0	price
count	16385.000000	16215.000000
mean	8193.000000	588.995521
std	4730.086416	3309.895206
min	1.000000	-36.980000
25%	4097.000000	1.284350
50%	8193.000000	4.227200
75%	12289.000000	107.426300
max	16385.000000	59221.230000

df_indices.head()

  
      
      Unnamed: 0
      symbol
      date
      price
    

  
      0
      1
      DPROPANEMBTX
      2019-01-02
      0.641
    

      1
      2
      DPROPANEMBTX
      2019-01-03
      0.630
    

      2
      3
      DPROPANEMBTX
      2019-01-04
      0.635
    

      3
      4
      DPROPANEMBTX
      2019-01-07
      0.623
    

      4
      5
      DPROPANEMBTX
      2019-01-08
      0.628
    

	Unnamed: 0	symbol	date	price
0	1	DPROPANEMBTX	2019-01-02	0.641
1	2	DPROPANEMBTX	2019-01-03	0.630
2	3	DPROPANEMBTX	2019-01-04	0.635
3	4	DPROPANEMBTX	2019-01-07	0.623
4	5	DPROPANEMBTX	2019-01-08	0.628

# Keep only the symbol/date/price columns (this drops the 'Unnamed: 0' column
# that came from the CSV's leading column).
df_indices = df_indices[["symbol","date","price"]]
df_indices.tail()

	symbol	date	price
16380	THREEFY5	2021-03-24	0.8123
16381	THREEFY5	2021-03-25	0.8126
16382	THREEFY5	2021-03-26	0.8354
16383	THREEFY5	2021-03-29	0.8687
16384	THREEFY5	2021-03-30	0.8818

df_SP500.head()

  
      
      symbol
      company
      identifier
      sedol
      weight
      sector
      shares_held
      local_currency
      exchange
    

  
      1
      AAPL
      Apple Inc.
      03783310
      2046251
      0.059338
      Information Technology
      161340980
      USD
      SP500
    

      2
      MSFT
      Microsoft Corporation
      59491810
      2588173
      0.055094
      Information Technology
      77110660
      USD
      SP500
    

      3
      AMZN
      Amazon.com Inc.
      02313510
      2000019
      0.040733
      Consumer Discretionary
      4376067
      USD
      SP500
    

      4
      FB
      Facebook Inc. Class A
      30303M10
      B7TL820
      0.021718
      Communication Services
      24592958
      USD
      SP500
    

      5
      GOOGL
      Alphabet Inc. Class A
      02079K30
      BYVY8G0
      0.019521
      Communication Services
      3074670
      USD
      SP500
    

	symbol	company	identifier	sedol	weight	sector	shares_held	local_currency	exchange
1	AAPL	Apple Inc.	03783310	2046251	0.059338	Information Technology	161340980	USD	SP500
2	MSFT	Microsoft Corporation	59491810	2588173	0.055094	Information Technology	77110660	USD	SP500
3	AMZN	Amazon.com Inc.	02313510	2000019	0.040733	Consumer Discretionary	4376067	USD	SP500
4	FB	Facebook Inc. Class A	30303M10	B7TL820	0.021718	Communication Services	24592958	USD	SP500
5	GOOGL	Alphabet Inc. Class A	02079K30	BYVY8G0	0.019521	Communication Services	3074670	USD	SP500

# Stack the stock prices and the index series into one long table of
# (symbol, date, price) rows.
df = pd.concat([df_prices_adj, df_indices])
# The dates are ISO strings such as '2019-01-02'. The former format string
# ('%Y%m%d') does not match them and errors='ignore' hid the failed parse,
# leaving the column as plain strings. Parse with the matching format and let
# a malformed date raise instead of passing through silently.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
# Pivot to wide form: one row per date, one column per symbol.
df = df.set_index(['date', 'symbol'])['price'].unstack()
# Missing observations are back-filled with the next available value of the
# same column.
df = df.bfill()
df

symbol	A	AAL	AAP	AAPL	ABBV	ABC	ABMD	ABT	ACN	ADBE	...	XEL	XLNX	XOM	XRAY	XYL	YUM	ZBH	ZBRA	ZION	ZTS
date
2019-01-02	64.511734	31.96316	156.2589	38.505024	79.101799	71.46416	309.96	67.034943	136.179626	224.570007	...	45.400452	84.360565	60.557911	37.21114	64.63606	87.819199	100.576180	156.24	38.71991	83.337715
2019-01-03	62.135132	29.58167	161.1371	34.669640	76.495514	70.42746	302.29	63.871284	131.530212	215.699997	...	45.221561	81.184296	59.628124	37.23077	62.42029	85.610275	98.756989	146.88	38.50573	80.457184
2019-01-04	64.285828	31.53016	157.1396	36.149662	78.959961	71.24338	313.44	65.694260	136.644577	226.190002	...	45.664082	84.943359	61.826595	38.31106	65.05392	87.838394	102.129860	152.97	39.68837	83.613907
2019-01-07	65.650917	32.42568	159.4450	36.069202	80.112411	71.75211	314.80	66.678070	137.119217	229.259995	...	45.466366	87.187141	62.148109	38.99852	64.09184	87.742371	102.169182	155.29	39.84668	84.117012
2019-01-08	66.613335	31.90411	158.3368	36.756794	80.484734	72.52003	318.42	65.877518	140.586914	232.679993	...	45.993618	85.526176	62.599968	38.73336	64.69435	87.569496	99.877983	156.33	40.20985	85.369850
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2021-03-24	120.656403	21.81000	181.7300	120.089996	103.059998	115.37000	294.21	118.019997	267.549988	451.510010	...	65.580002	119.959999	56.340000	60.25000	101.11000	107.080002	157.210648	463.81	53.01000	155.429993
2021-03-25	121.714798	22.77000	185.6500	120.589996	103.879997	117.34000	294.14	119.050003	268.609985	450.989990	...	66.000000	120.029999	56.180000	60.55000	101.93000	107.379997	157.639999	461.26	54.74000	152.880005
2021-03-26	125.449112	22.93000	187.3200	121.209999	105.980003	118.73000	301.40	122.070000	280.769989	469.089996	...	66.309998	123.139999	57.709999	61.28000	104.76000	108.059998	161.320007	476.96	55.85000	156.149994
2021-03-29	125.229446	22.91000	185.0600	121.389999	106.730003	119.05000	305.77	122.230003	279.540009	469.320007	...	67.000000	122.230003	57.400002	62.06000	104.27000	109.209999	160.210007	467.07	53.89000	158.389999
2021-03-30	124.650322	24.12000	186.0700	119.900002	106.790001	119.06000	309.88	119.750000	278.549988	465.459991	...	66.010002	120.300003	56.689999	63.54000	104.88000	109.769997	161.220001	474.83	55.91000	157.039993

565 rows × 531 columns

# Standardise every column: subtract its mean and divide by its standard
# deviation, so series on very different scales become comparable.
df = (df-df.mean())/df.std()

df.describe()

symbol	A	AAL	AAP	AAPL	ABBV	ABC	ABMD	ABT	ACN	ADBE	...	XEL	XLNX	XOM	XRAY	XYL	YUM	ZBH	ZBRA	ZION	ZTS
count	5.650000e+02	5.650000e+02	5.650000e+02	5.650000e+02	5.650000e+02	5.650000e+02	5.650000e+02	5.650000e+02	5.650000e+02	5.650000e+02	...	5.650000e+02	5.650000e+02	5.650000e+02	5.650000e+02	5.650000e+02	5.650000e+02	5.650000e+02	5.650000e+02	5.650000e+02	5.650000e+02
mean	-5.533430e-16	-1.156990e-15	-2.339132e-15	2.364284e-15	-3.420666e-15	-1.307902e-15	7.797106e-16	4.426744e-15	8.551665e-16	4.326136e-15	...	-1.237476e-14	4.099769e-15	7.545587e-16	-1.760637e-16	-7.646194e-15	-1.936701e-15	-2.678683e-15	-1.810941e-15	-2.867323e-15	-3.068539e-15
std	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	...	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00
min	-1.402956e+00	-1.578205e+00	-4.368070e+00	-1.422373e+00	-1.783790e+00	-1.920594e+00	-1.960001e+00	-1.883874e+00	-2.071674e+00	-1.646058e+00	...	-2.597860e+00	-2.077320e+00	-1.906102e+00	-2.730556e+00	-2.111907e+00	-4.089893e+00	-3.106090e+00	-1.373928e+00	-2.269724e+00	-2.059833e+00
25%	-7.205020e-01	-1.081580e+00	-3.269484e-01	-9.160365e-01	-8.075055e-01	-6.871508e-01	-9.956515e-01	-7.258871e-01	-7.138452e-01	-9.190687e-01	...	-5.309481e-01	-7.727544e-01	-9.708038e-01	-8.461749e-01	-5.681917e-01	-6.355256e-01	-7.359661e-01	-6.979548e-01	-1.004871e+00	-7.099711e-01
50%	-3.988059e-01	3.186508e-01	2.191719e-01	-2.702986e-01	-7.196453e-02	-2.771314e-01	2.134506e-01	-3.168558e-01	-2.189815e-01	-2.961142e-01	...	6.798383e-02	-1.999376e-01	3.236947e-01	1.573233e-01	-1.819963e-01	3.273311e-02	1.208611e-01	-2.155373e-01	2.149392e-01	-1.179682e-01
75%	6.010351e-01	8.994444e-01	5.329562e-01	1.102648e+00	7.246279e-01	7.769495e-01	8.128134e-01	8.944433e-01	7.914156e-01	1.047349e+00	...	6.942542e-01	6.930862e-01	8.373723e-01	8.655397e-01	3.657484e-01	7.911210e-01	7.965623e-01	2.228456e-01	6.929752e-01	1.075490e+00
max	2.370841e+00	1.631285e+00	2.129579e+00	2.053239e+00	2.192993e+00	2.759439e+00	1.856881e+00	2.525851e+00	2.386994e+00	1.970695e+00	...	2.121844e+00	2.439377e+00	1.533337e+00	2.122636e+00	2.542459e+00	1.801157e+00	1.843039e+00	3.113057e+00	2.442046e+00	1.818189e+00

8 rows × 531 columns

2. Data Visualization

2.1. Stock Price Volatility

%matplotlib inline
# Plot the first 20 columns of df on a single axis (legend suppressed).
fig, ax1 = plt.subplots(figsize=(20, 15))
df.iloc[:,:20].plot(ax=ax1, legend=False)
plt.tight_layout()
plt.show()

png

# Row-over-row change of every column. DataFrame.diff() subtracts the previous
# row from each row for all columns at once, replacing the per-column loop.
df_delta = df.diff()
# The first row has no predecessor (diff yields NaN there); define its change
# as 0.
df_delta.iloc[0] = 0
df_delta

symbol	A	AAL	AAP	AAPL	ABBV	ABC	ABMD	ABT	ACN	ADBE	...	XEL	XLNX	XOM	XRAY	XYL	YUM	ZBH	ZBRA	ZION	ZTS
date
2019-01-02	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
2019-01-03	-0.132308	-0.278940	0.281110	-0.123113	-0.190242	-0.096013	-0.126329	-0.210193	-0.138905	-0.100851	...	-0.029680	-0.171040	-0.072336	0.002954	-0.197347	-0.215205	-0.106702	-0.114809	-0.028157	-0.119316
2019-01-04	0.119732	0.228224	-0.230359	0.047508	0.179889	0.075565	0.183646	0.121119	0.152796	0.119270	...	0.073418	0.202423	0.171038	0.162583	0.234564	0.217075	0.197830	0.074699	0.155477	0.130757
2019-01-07	0.075996	0.104891	0.132851	-0.002583	0.084121	0.047115	0.022400	0.065364	0.014180	0.034905	...	-0.032803	0.120826	0.025013	0.103462	-0.085687	-0.009355	0.002306	0.028457	0.020812	0.020839
2019-01-08	0.053579	-0.061091	-0.063861	0.022071	0.027177	0.071120	0.059623	-0.053189	0.103600	0.038885	...	0.087476	-0.089442	0.035154	-0.039906	0.053662	-0.016842	-0.134387	0.012757	0.047745	0.051895
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2021-03-24	-0.063369	-0.053879	0.126777	-0.078644	-0.129928	0.148182	-0.072635	-0.164107	0.045411	-0.098804	...	0.023227	-0.116853	0.087134	0.063104	0.063236	-0.089631	0.076719	0.014228	-0.074936	-0.025267
2021-03-25	0.058922	0.112443	0.225893	0.016050	0.059855	0.182449	-0.001153	0.068434	0.031668	-0.005913	...	0.069681	0.003769	-0.012448	0.045150	0.073033	0.029227	0.025183	-0.031278	0.227436	-0.105625
2021-03-26	0.207894	0.018741	0.096235	0.019902	0.153287	0.128733	0.119576	0.200649	0.363291	0.205795	...	0.051431	0.167471	0.119032	0.109865	0.252053	0.066249	0.215845	0.192574	0.145927	0.135448
2021-03-29	-0.012229	-0.002343	-0.130234	0.005778	0.054745	0.029636	0.071976	0.010631	-0.036747	0.002615	...	0.114477	-0.049003	-0.024117	0.117390	-0.043642	0.112039	-0.065105	-0.121310	-0.257674	0.092785
2021-03-30	-0.032241	0.141726	0.058202	-0.047828	0.004379	0.000926	0.067694	-0.164771	-0.029578	-0.043888	...	-0.164249	-0.103929	-0.055237	0.222739	0.054329	0.054558	0.059240	0.095183	0.265562	-0.055920

565 rows × 531 columns

%matplotlib inline
# Plot the row-over-row changes of all columns on a single axis (legend
# suppressed).
fig, ax1 = plt.subplots(figsize=(20, 15))
df_delta.plot(ax=ax1, legend=False)
plt.tight_layout()
plt.show()

png

2.2. Rolling Average of Stock Price Correlation

def calculate_corr(df_stock_returns, returns_window, corr_window_size, corr_method):
    """Compute column cross-correlation matrices over consecutive row windows.

    Windows start at row position ``returns_window`` and advance by
    ``corr_window_size`` rows; each window covers the rows
    ``[start, start + corr_window_size)``. The last window may be shorter
    because slicing stops at the end of the frame.

    Args:
        df_stock_returns: DataFrame with one row per observation and one
            column per series.
        returns_window: Row position at which the first window starts.
        corr_window_size: Number of rows per window, also the step between
            consecutive window starts.
        corr_method: Correlation method forwarded to ``DataFrame.corr``
            (e.g. ``'pearson'``, ``'spearman'``).

    Returns:
        A tuple ``(stocks_cross_corr_dict, x_days, y_mean_corr)`` where
        ``stocks_cross_corr_dict`` maps each window start to its correlation
        matrix, ``x_days`` lists the window starts in order, and
        ``y_mean_corr`` holds the mean absolute correlation of each window.
    """
    stocks_cross_corr_dict = {}
    x_days = []
    y_mean_corr = []
    for start in range(returns_window, len(df_stock_returns), corr_window_size):
        # Use the function's own parameters: the window length used to be read
        # from a global ``W`` and the method was hard-coded to 'pearson'.
        window = df_stock_returns.iloc[start:start + corr_window_size]
        # Undefined correlations (NaN) are treated as "no correlation".
        corr = window.corr(method=corr_method).fillna(0)
        stocks_cross_corr_dict[start] = corr
        x_days.append(start)
        y_mean_corr.append(np.abs(corr.to_numpy()).mean())
    return stocks_cross_corr_dict, x_days, y_mean_corr

%matplotlib inline
# Window lengths (in rows) to compare: 21, 42, ..., 105.
start = 21
end = 126
step = 21
window_sizes = list(range(start, end, step))
plt.figure(figsize=(20, 10))

# The loop variable is deliberately named W and lives at module level:
# calculate_corr may look the window length up as a global of that name.
for W in window_sizes:
    _, x_days, y_mean_corr = calculate_corr(df, 1, W, 'pearson')
    plt.plot(x_days, y_mean_corr, label=str(W))

# Axis labels and legend only need to be set once, after all curves are drawn.
plt.xlabel('Days')
plt.ylabel('Mean Correlation')
plt.legend(loc='upper left')
plt.show()

png

3. Network Analysis

3.1. Build Graph with Correlation table

# Pairwise correlation matrix between all columns of df; the graphs are
# created from this matrix.
cor_matrix = df.corr()

cor_matrix

symbol	A	AAL	AAP	AAPL	ABBV	ABC	ABMD	ABT	ACN	ADBE	...	XEL	XLNX	XOM	XRAY	XYL	YUM	ZBH	ZBRA	ZION	ZTS
symbol
A	1.000000	-0.512211	0.340607	0.936847	0.901146	0.866659	0.476208	0.932064	0.916575	0.876950	...	0.581891	0.661159	-0.481706	0.276888	0.819616	0.232105	0.731162	0.947770	0.196208	0.830023
AAL	-0.512211	1.000000	0.452672	-0.709976	-0.567905	-0.636546	0.067317	-0.620271	-0.515571	-0.761706	...	-0.685099	0.040970	0.950026	0.414638	-0.104279	0.401344	-0.116920	-0.402034	0.675397	-0.720983
AAP	0.340607	0.452672	1.000000	0.132583	0.218725	0.129889	0.598790	0.206132	0.304547	0.062110	...	-0.102458	0.592191	0.471304	0.536131	0.517752	0.604171	0.464449	0.358634	0.691259	0.045798
AAPL	0.936847	-0.709976	0.132583	1.000000	0.883271	0.900174	0.311099	0.946722	0.931212	0.971310	...	0.752885	0.464395	-0.694573	0.138280	0.700491	0.103699	0.668931	0.859869	-0.067884	0.937015
ABBV	0.901146	-0.567905	0.218725	0.883271	1.000000	0.842135	0.379425	0.820668	0.826314	0.834858	...	0.525801	0.430849	-0.525965	0.167883	0.625416	0.017315	0.617159	0.868875	0.131018	0.756742
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
YUM	0.232105	0.401344	0.604171	0.103699	0.017315	0.148166	0.135065	0.246568	0.391225	0.042001	...	0.150926	0.517301	0.446745	0.791644	0.591224	1.000000	0.622313	0.311883	0.601129	0.211623
ZBH	0.731162	-0.116920	0.464449	0.668931	0.617159	0.647128	0.138407	0.709774	0.815626	0.581120	...	0.583386	0.473123	-0.072924	0.677935	0.793259	0.622313	1.000000	0.754226	0.495103	0.715109
ZBRA	0.947770	-0.402034	0.358634	0.859869	0.868875	0.842147	0.398625	0.902315	0.895845	0.774380	...	0.477856	0.635769	-0.332037	0.410140	0.824975	0.311883	0.754226	1.000000	0.353050	0.754373
ZION	0.196208	0.675397	0.691259	-0.067884	0.131018	0.003435	0.274263	0.036986	0.155021	-0.207014	...	-0.314988	0.431878	0.711608	0.768845	0.486766	0.601129	0.495103	0.353050	1.000000	-0.148741
ZTS	0.830023	-0.720983	0.045798	0.937015	0.756742	0.869275	0.075617	0.902965	0.917018	0.930572	...	0.901222	0.331572	-0.696158	0.206471	0.652798	0.211623	0.715109	0.754373	-0.148741	1.000000

531 rows × 531 columns

def _correlation_graph(weights):
    """Build an undirected weighted graph from a square, symbol-labelled DataFrame.

    Every non-zero entry becomes an edge whose 'weight' attribute is that
    entry; nodes are labelled with the frame's index values and self-loops
    are dropped.
    """
    # from_numpy_array replaces from_numpy_matrix, which was removed in
    # NetworkX 3.0; it takes a plain ndarray, so np.asmatrix is not needed.
    graph = nx.from_numpy_array(weights.to_numpy())
    graph = nx.relabel_nodes(graph, dict(enumerate(weights.index.values)))
    # Materialise the self-loop edges first so the graph is not mutated while
    # it is being iterated.
    graph.remove_edges_from(list(nx.selfloop_edges(graph)))
    return graph


symbols = cor_matrix.index.values

# Positive-correlation graph: negative (and undefined) entries become 0, i.e.
# no edge.
mat_pos = cor_matrix[cor_matrix >= 0].fillna(0)
G_pos = _correlation_graph(mat_pos)

# Negative-correlation graph: keep only negative entries and use their
# magnitude as the edge weight.
mat_neg = cor_matrix[cor_matrix < 0].fillna(0).abs()
G_neg = _correlation_graph(mat_neg)

list(G_pos.edges(data=True))[:5], list(G_neg.edges(data=True))[:5]

([('A', 'AAP', {'weight': 0.3406072572702165}),
  ('A', 'AAPL', {'weight': 0.9368465656977406}),
  ('A', 'ABBV', {'weight': 0.9011464914155253}),
  ('A', 'ABC', {'weight': 0.8666593549791352}),
  ('A', 'ABMD', {'weight': 0.476208419378593})],
 [('A', 'AAL', {'weight': 0.5122106576033231}),
  ('A', 'AEP', {'weight': 0.010066159905771194}),
  ('A', 'AFL', {'weight': 0.14527226301899576}),
  ('A', 'AIG', {'weight': 0.21700976911380374}),
  ('A', 'ALK', {'weight': 0.13546044591421322})])

# All column names of the wide table (one per series).
symbol =  df.columns
# Sector lookup for the series that appear in df_SP500: keep the matching rows
# and only the symbol/sector columns.
df_sector = df_SP500[df_SP500['symbol'].isin(symbol)]
df_sector = df_sector[["symbol","sector"]]
# Every distinct symbol of df_indices is put into one synthetic sector.
tmp = pd.DataFrame({"symbol":df_indices["symbol"].unique(), "sector":"Macroeconomic Indices"})
df_sector = pd.concat([df_sector,tmp],ignore_index=True)
# Integer code per sector: numbered from 1 in order of first appearance;
# 0 is only the initial placeholder value.
df_sector['sec_idx']= 0
for i,sec in enumerate(df_sector["sector"].unique()):
    df_sector.loc[df_sector['sector']==sec,'sec_idx'] = i+1

df_sector

	symbol	sector	sec_idx
0	AAPL	Information Technology	1
1	MSFT	Information Technology	1
2	AMZN	Consumer Discretionary	2
3	FB	Communication Services	3
4	GOOGL	Communication Services	3
...	...	...	...
526	GOLDAMGBD228NLBM	Macroeconomic Indices	12
527	IOER	Macroeconomic Indices	12
528	IORR	Macroeconomic Indices	12
529	THREEFY1	Macroeconomic Indices	12
530	THREEFY5	Macroeconomic Indices	12

531 rows × 3 columns

# Attach each symbol's sector code to the matching node of both graphs under
# the attribute name 'sec_idx'.
sector_codes = df_sector.set_index('symbol')['sec_idx']
for graph in (G_pos, G_neg):
    nx.set_node_attributes(graph, sector_codes, 'sec_idx')

3.2. Setting threshold on weights

def set_threshold(G,threshold):
    """Return a copy of ``G`` without the edges whose |weight| is below ``threshold``.

    ``G`` itself is left unchanged; nodes are kept even if they lose all of
    their edges.
    """
    pruned = G.copy()
    weak_edges = [
        (u, v)
        for u, v, weight in G.edges.data('weight')
        if abs(weight) < threshold
    ]
    pruned.remove_edges_from(weak_edges)
    return pruned

# Prune both graphs at a weight threshold of 0.5 and show the first five
# remaining edges of each.
H_pos = set_threshold(G_pos,0.5)
H_neg = set_threshold(G_neg,0.5)
list(H_pos.edges(data=True))[:5], list(H_neg.edges(data=True))[:5]

([('A', 'AAPL', {'weight': 0.9368465656977406}),
  ('A', 'ABBV', {'weight': 0.9011464914155253}),
  ('A', 'ABC', {'weight': 0.8666593549791352}),
  ('A', 'ABT', {'weight': 0.932064482315709}),
  ('A', 'ACN', {'weight': 0.916574717968791})],
 [('A', 'AAL', {'weight': 0.5122106576033231}),
  ('A', 'BA', {'weight': 0.5064885797862092}),
  ('A', 'BXP', {'weight': 0.522527131570438}),
  ('A', 'CCL', {'weight': 0.5559250497113872}),
  ('A', 'DEXCAUS', {'weight': 0.6382900166453468})])

3.3. Community Detection

with Louvain Algorithm

# grid search
# for positive weights
# For each candidate threshold the positive graph is pruned, partitioned with
# Louvain, and the communities with at least 10 nodes are reported.
for t, cor_thresold in enumerate(np.linspace(0.8,0.85,20)):
    # Louvain is randomised. Reseed for every candidate so that each result
    # depends only on its own threshold and can be reproduced later by seeding
    # with 2021 and partitioning that single graph. Seeding once before the
    # loop made candidate t depend on all earlier iterations.
    np.random.seed(2021)
    H_pos = set_threshold(G_pos,cor_thresold)
    partition = louvain.best_partition(H_pos)
    modularity = louvain.modularity(partition, H_pos)
    values = [partition.get(node) for node in H_pos.nodes()]
    # communities[i] holds the nodes assigned to community id i.
    communities = [
        [node for node, comm in partition.items() if comm == i]
        for i in range(len(set(values)))
    ]
    print("{}th Total number of Communities = {}".format(t ,len(communities)))
    k = 0
    for i, comm_nodes in enumerate(communities):
        if len(comm_nodes)>=10:
            k+=1
            print('community {}th: '.format(i),len(comm_nodes))
    print('n_big_communities: ',k)
# The recorded run (seeded once, before the loop) gave 5 big communities at
# cor_threshold = np.linspace(0.8,0.85,20)[8]; re-check the chosen index after
# re-running with per-candidate seeding.

...
5th Total number of Communities = 36
community 0th:  162
community 1th:  188
community 4th:  103
community 6th:  35
n_big_communities:  4
6th Total number of Communities = 36
community 0th:  172
community 1th:  180
community 3th:  113
community 4th:  25
n_big_communities:  4
7th Total number of Communities = 36
community 0th:  152
community 3th:  112
community 4th:  46
community 6th:  180
n_big_communities:  4
8th Total number of Communities = 41
community 2th:  77
community 4th:  107
community 5th:  157
community 7th:  37
community 9th:  108
n_big_communities:  5
9th Total number of Communities = 41
community 0th:  153
community 4th:  103
community 6th:  43
community 7th:  79
community 9th:  108
n_big_communities:  5
10th Total number of Communities = 41
community 0th:  146
community 2th:  84
community 4th:  109
community 6th:  41
community 9th:  106
n_big_communities:  5
11th Total number of Communities = 42
community 0th:  145
community 2th:  178
community 4th:  126
community 8th:  35
n_big_communities:  4
...

# Partition the positive graph at the threshold selected in the grid search
# and report the communities with at least 10 nodes.
np.random.seed(2021)
cor_thresold = np.linspace(0.8,0.85,20)[8]
H_pos = set_threshold(G_pos,cor_thresold)
partition = louvain.best_partition(H_pos)
values = [partition.get(node) for node in H_pos.nodes()]
tmp = list(partition.items())
# communities[i] holds the nodes assigned to community id i.
communities = [
    [n for n, c in tmp if c == i]
    for i in range(len(set(values)))
]
print("Total number of Communities = {}".format(len(communities)))
k = 0
sum_comm_nodes = 0
for i, comm_nodes in enumerate(communities):
    size = len(comm_nodes)
    sum_comm_nodes += size
    if size < 10:
        continue
    k += 1
    print('community {}th: '.format(i), size)
print('n_big_communities: ',k)

Total number of Communities = 41
community 0th:  164
community 2th:  79
community 4th:  105
community 7th:  30
community 9th:  108
n_big_communities:  5

# Store each node's community id on the graph, then draw it with a spring
# layout, colouring nodes by community id.
nx.set_node_attributes(H_pos,partition,'community')
values = [partition.get(node) for node in H_pos.nodes()]
plt.figure(figsize=(10,10))
nx.draw_spring(H_pos, cmap = plt.get_cmap('jet'), node_color = values, node_size=30, with_labels=False)

png

# for negative weights
# For each candidate threshold the negative graph is pruned, partitioned with
# Louvain, and the communities with at least 10 nodes are reported.
for t, cor_thresold in enumerate(np.linspace(0.6,0.65,20)):
    # Louvain is randomised. Reseed for every candidate so that each result
    # depends only on its own threshold and can be reproduced later by seeding
    # with 2021 and partitioning that single graph. Seeding once before the
    # loop made candidate t depend on all earlier iterations.
    np.random.seed(2021)
    H_neg = set_threshold(G_neg,cor_thresold)
    partition = louvain.best_partition(H_neg)
    modularity = louvain.modularity(partition, H_neg)
    values = [partition.get(node) for node in H_neg.nodes()]
    # communities[i] holds the nodes assigned to community id i.
    communities = [
        [node for node, comm in partition.items() if comm == i]
        for i in range(len(set(values)))
    ]

    print("{}th Total number of Communities = {}".format(t ,len(communities)))
    k = 0
    for i, comm_nodes in enumerate(communities):
        if len(comm_nodes)>=10:
            k+=1
            print('community {}th: '.format(i),len(comm_nodes))
    print('n_big_communities: ',k)
# The recorded run (seeded once, before the loop) gave 5 big communities at
# cor_threshold = np.linspace(0.6,0.65,20)[11]; re-check the chosen index after
# re-running with per-candidate seeding.

...
9th Total number of Communities = 37
community 0th:  104
community 1th:  83
community 2th:  177
community 5th:  119
community 6th:  16
n_big_communities:  5
10th Total number of Communities = 40
community 0th:  103
community 1th:  83
community 2th:  177
community 3th:  14
community 6th:  119
n_big_communities:  5
11th Total number of Communities = 41
community 0th:  104
community 1th:  83
community 2th:  176
community 3th:  13
community 6th:  119
n_big_communities:  5
12th Total number of Communities = 42
community 0th:  103
community 1th:  81
community 3th:  13
community 6th:  130
community 11th:  167
n_big_communities:  5
13th Total number of Communities = 42
community 0th:  106
community 1th:  81
community 2th:  176
community 6th:  118
community 7th:  13
n_big_communities:  5
14th Total number of Communities = 43
community 0th:  204
community 1th:  165
community 2th:  18
community 5th:  105
n_big_communities:  4
...

# Partition the negative graph at the threshold selected in the grid search
# and report the communities with at least 10 nodes.
np.random.seed(2021)
cor_thresold = np.linspace(0.6,0.65,20)[11]
# Rebind H_neg itself to the graph pruned at the selected threshold. The
# partition used to be computed on H_neg_otim while modularity, the node
# colours, the drawing and the GraphML export all used the stale H_neg left
# over from the last grid-search iteration. H_neg_otim is kept as an alias.
H_neg = H_neg_otim = set_threshold(G_neg,cor_thresold)
partition = louvain.best_partition(H_neg)
modularity = louvain.modularity(partition, H_neg)
values = [partition.get(node) for node in H_neg.nodes()]
tmp = list(partition.items())
# communities[i] holds the nodes assigned to community id i.
communities = [
    [n for n, c in tmp if c == i]
    for i in range(len(set(values)))
]
sum_comm_nodes = 0
k=0
print("Total number of Communities = {}".format(len(communities)))
for i, comm_nodes in enumerate(communities):
    if len(comm_nodes)>=10:
        k+=1
        print('community {}th: '.format(i),len(comm_nodes))
    sum_comm_nodes+=len(comm_nodes)
print('n_big_communities: ',k)

Total number of Communities = 41
community 0th:  102
community 1th:  83
community 3th:  13
community 6th:  130
community 11th:  167
n_big_communities:  5

# Store each node's community id on the graph, then draw it with a spring
# layout, colouring nodes by community id.
# NOTE(review): the partition should come from the same pruned graph that is
# drawn here; verify that H_neg is the graph that was partitioned.
nx.set_node_attributes(H_neg, partition,'community')
values = [partition.get(node) for node in H_neg.nodes()]
plt.figure(figsize=(10,10))
nx.draw_spring(H_neg, cmap = plt.get_cmap('jet'), node_color = values, node_size=30, with_labels=False)

png

3.4. Visualization with Gephi

# Export both pruned graphs (including their node attributes) as GraphML
# files, e.g. for loading into Gephi.
nx.write_graphml(H_pos, "model_H_pos.graphml")
nx.write_graphml(H_neg, "model_H_neg.graphml")

png

Stock Market Cluster Analysis with NetworkX

1. Data Load and Preprocessing

2. Data Visualization

2.1. Stock Price Volatility

2.2. Rolling Average of Stock Price Correlation

3. Network Analysis

3.1. Build Graph with Correlation table

3.2. Setting threshold on weights

3.3. Community Detection

3.4. Visualization with Gephi

K-POP Fandom Data Analysis with networkX

R - Air Pollution Data Analysis

1. Data Load and Preprocessing

2. Data Visualization

2.1. Stock Price Volatility

2.2. Rolling Average of Stock Price Correlation

3. Network Analysis

3.1. Build Graph with Correlation table

3.2. Setting threshold on weights

3.3. Community Detection

3.4. Visualization with Gephi

Search Darron's Devlog