diff --git a/HESC/.Rhistory b/HESC/.Rhistory
new file mode 100644
index 0000000..e69de29
diff --git a/HESC/README_hesc.md b/HESC/README_hesc.md
new file mode 100644
index 0000000..0eccf16
--- /dev/null
+++ b/HESC/README_hesc.md
@@ -0,0 +1,163 @@
+# multivariate-weather-data-clustering for HESC branch
+
+## Download
+
+There are four ways to download and manage the MWDC package:
+
+1 - Use [GitHub Desktop](https://desktop.github.com/) (Recommended)
+
+2 - Use the command line:
+
+```bash
+ git clone https://github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git
+```
+
+\*Because the repository is private, the command line method is not recommended.
+
+3 - Download the `.zip` file and use it.
+
+4 - On Google Colab, use the command below.
+
+```bash
+!git clone https://{classic_access_token}@github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git
+```
+
+\*\* This is how to generate a [classic_access_token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token#creating-a-personal-access-token-classic).
+
+## Installation
+
+#### 1. On PC
+
+To install the package, first create an environment using [pip](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/) or [conda](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html).
+
+##### Conda environment setup
+```bash
+conda create -n mwdc pandas numpy xarray netCDF4 matplotlib scikit-learn scipy dask
+conda activate mwdc
+```
+
+After that, clone this repository and install it via the `setup.py` file inside it.
+
+```bash
+ cd multivariate-weather-data-clustering
+ python setup.py install
+```
+
+Note: If you are using macOS, use `python3 setup.py install` instead.
+
+#### 2. On Google Colab
+
+After cloning the repository, run the commands below to install it.
+
+```bash
+ %cd multivariate-weather-data-clustering
+ !python setup.py install
+```
+
+## Usage
+
+To use the functions, simply import them from MWDC. Modules can be imported either separately or all together.
+
+```python
+from mwdc import *
+
+## or ##
+
+from mwdc.preprocessing import preprocessing
+from mwdc.evaluation import st_evaluation
+from mwdc.visualization import visualization
+
+```
+
+Example:
+
+```python
+trans_data = preprocessing.datatransformation(data)
+```
+
+## Modules Documentation
+
+### preprocessing
+
+| Functions              | Description                                                                       |
+| :--------------------- | :-------------------------------------------------------------------------------- |
+| `transformddaily()`    | Transformation function for daily data                                            |
+| `transformdmock()`     | Transformation function for mock data                                             |
+| `transformqm()`        | Variable for quarter map                                                           |
+| `datatransformation()` | Description in the note below\*                                                    |
+| `datanormalization()`  | Input is the transformed pandas DataFrame                                          |
+| `null_fill()`          | Fills NaN values across variables                                                  |
+| `pca1()`               | `data` is the data to be input, `n` is the number of components                   |
+| `pcacomponents()`      | Shows the proper number of PCA components by computing the cumulative variance    |
+| `data_preprocessing()` | Transforms the xarray input data into a 2D NumPy array                            |
+
+\*Note: This function is used to transform the xarray dataset into a pandas DataFrame where the dimension "time" becomes the index of the DataFrame and
+pairs of the dimensions "latitude" and "longitude" become the columns for each variable.
+
+### clustering
+
+#### - DBscan
+
+| Functions                         | Description                                                                        |
+| :-------------------------------- | :---------------------------------------------------------------------------------- |
+| `dbscanreal(x, eps1=0.5, min=5)`  | `eps1` is epsilon, `min` is the minimum number of samples, `x` is the input data    |
+
+#### - Agglomerative Clustering
+
+| Functions                                                | Description                                                         |
+| :-------------------------------------------------------- | :-------------------------------------------------------------------- |
+| `st_agglomerative(data, n, K, p=7, affinity, linkage)`   | `n` = PCA components, `K` = number of clusters, `p` = truncate_mode   |
+
+#### - Kmeans
+
+| Functions                                                                   | Description |
+| :--------------------------------------------------------------------------- | :---------- |
+| `Kmeans(n_cluster).fit(xarray_data, PCA=(bool), pass_trans_data=(bool))`    | \*          |
+| `Kmeans(n_cluster).evaluate(z, PCA=(bool), pass_trans_data=(bool))`         | \*\*        |
+
+\* This function fits the K-means model to the data that is passed to it.
+Parameters that this function will accept are as follows:
+
+1. xarray_data = string of the name of the original xarray file.
+2. PCA (bool) = whether or not PCA has to be applied. Default value is True.
+3. pass_trans_data (bool) = whether saved transformed data has to be passed. If False, the data will be transformed on the fly. Default value is True.
+
+\*\* This function evaluates and assigns data points to clusters.
+Parameters that this function will accept are as follows:
+
+1. z = string of the name of the original xarray file.
+2. PCA (bool) = whether or not PCA has to be applied. Default value is True.
+3. pass_trans_data (bool) = whether saved transformed data has to be passed. If False, the data will be transformed on the fly. Default value is True.
+
+#### - evaluation
+
+| Functions                     | Params                                                                                                   |
+| :---------------------------- | :--------------------------------------------------------------------------------------------------------- |
+| `st_rmse()`                   | input, formed_clusters                                                                                      |
+| `st_corr()`                   | input, formed_clusters                                                                                      |
+| `st_calinski()`               | input, formed_clusters                                                                                      |
+| `davies_bouldin()`            | input, formed_clusters                                                                                      |
+| `compute_silhouette_score()`  | X, labels, transformation=False, \*, metric="euclidean", sample_size=None, random_state=None, \*\*kwds     |
+
+#### - visualization
+
+| Functions            | Params                                   |
+| :------------------- | :---------------------------------------- |
+| `visualization()`    | data_file, cluster_filename, coast_file   |
+| `make_Csv_cluster()` | label, name                               |
+
+\* Parameters that `visualization()` will accept are as follows:
+
+1. data_file is the .nc file containing the raw, unprocessed data.
+   \- Example: data_file = 'path/data.nc'
+2. cluster_filename is the csv file which contains clusterid and time_step, i.e. which cluster each date belongs to.
+   \- Example: cluster_filename = 'path/clusters.csv'
+3. coast_file contains the data describing how the coastline should look in the result.
+   \- Example: 'path/coast.txt'
+
+####
+
+\* Parameters that `make_Csv_cluster()` will accept are as follows:
+
+1. label contains the cluster IDs.
+2. name is the file name that will be generated, e.g. 'test.csv'.
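+
+### Example: end-to-end usage (sketch)
+
+A minimal sketch that chains the functions documented above, assuming the module layout shown in the Usage section; the `Kmeans` import path, the file paths, and the cluster count are placeholders to adapt to your setup.
+
+```python
+import xarray as xr
+from mwdc.preprocessing import preprocessing
+from mwdc.visualization import visualization
+from mwdc import Kmeans  # import path assumed; adjust to where Kmeans lives in your install
+
+# Transform the raw netCDF data: "time" becomes the index, (latitude, longitude) pairs become columns.
+data = xr.open_dataset('path/data.nc')  # placeholder path to the raw .nc file
+trans_data = preprocessing.datatransformation(data)
+
+# Fit K-means on the original file and assign each time step to a cluster.
+model = Kmeans(7)  # placeholder number of clusters
+model.fit('path/data.nc', PCA=True, pass_trans_data=False)
+labels = model.evaluate('path/data.nc', PCA=True, pass_trans_data=False)
+
+# Save the labels and plot the clusters on the map defined by the coastline file.
+visualization.make_Csv_cluster(labels, 'clusters.csv')
+visualization.visualization('path/data.nc', 'clusters.csv', 'path/coast.txt')
+```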
diff --git a/HESC/all_models.py b/HESC/all_models.py index 454d7c1..abf724e 100644 --- a/HESC/all_models.py +++ b/HESC/all_models.py @@ -1,10 +1,6 @@ # -*- coding: utf-8 -*- """all_models.ipynb -Automatically generated by Colaboratory. - -Original file is located at - https://colab.research.google.com/drive/1Vw2_E0x8GozGufXJghJ28Hcg8KsUOrS7 """ from google.colab import drive @@ -15,18 +11,27 @@ # Install dask.dataframe !pip install "dask[dataframe]" !pip install netCDF4 +!pip install PyMetis +!pip install kahypar -import pandas as pd +import os +import warnings +from typing import Optional import numpy as np -import time +import pandas as pd +import xarray as xr +import pymetis +import kahypar +from scipy import sparse +from sklearn.metrics import pairwise_distances, normalized_mutual_info_score +from sklearn.utils.extmath import safe_sparse_dot + +from sklearn.metrics import silhouette_score, pairwise_distances, davies_bouldin_score +from sklearn.cluster import KMeans from matplotlib import pyplot as plt from matplotlib.pyplot import cm -#import netCDF4 -# from netCDF4 import Dataset -# import netCDF4 as nc import random import netCDF4 as nc -import xarray as xr import datetime import datetime as dt from netCDF4 import date2num,num2date @@ -44,45 +49,25 @@ from scipy.sparse import csr_matrix from sklearn.mixture import GaussianMixture from sklearn.metrics import adjusted_rand_score - import seaborn as sns -import xarray as xr -#from mwdc.clustering.st_agglomerative import st_agglomerative - -import warnings -warnings.filterwarnings("ignore") - -# import netCDF4 -# from netCDF4 import Dataset -from sklearn.preprocessing import StandardScaler from scipy.cluster.hierarchy import dendrogram, linkage import scipy.cluster.hierarchy as sch from sklearn.cluster import AgglomerativeClustering -# from mwdc.visualization.clusterplotting import clusterPlot2D -# from mwdc.visualization.visualization import visualization2 -# from mwdc.preprocessing.preprocessing import data_preprocessing -# from mwdc.evaluation.st_evaluation import st_rmse_df, st_corr, st_rmse_np -# from mwdc.clustering.st_agglomerative import st_agglomerative - import sys import pickle import matplotlib as mpl import matplotlib.colors as colors -import os -import xarray as xr -import warnings -warnings.filterwarnings("ignore") -!pip install netCDF4 +warnings.filterwarnings("ignore") -from sklearn.metrics import silhouette_samples, silhouette_score def silhouette_score1(X, labels, *, metric="cosine", sample_size=None, random_state=None, **kwds): return np.mean(silhouette_samples(X, labels, metric="cosine", **kwds)) ## This function will will pre-process our daily data for DEC model as numpy array from sklearn import preprocessing -from sklearn.preprocessing import MinMaxScaler + +warnings.filterwarnings("ignore") def data_preprocessing(data_path): rdata_daily = xr.open_dataset(data_path) # data_path = '/content/drive/MyDrive/ERA5_Dataset.nc' @@ -217,20 +202,11 @@ def avg_inter_dist(norm_data, clustering_results): """# **Implementation**""" -#path2 = ('/content/drive/MyDrive/Data/mock_v4.nc') path2 = ('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc') -#path2 = ('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc') -#path2 = ('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily_smalldomain.nc') -#path2 = ('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_hourly.nc') -#path2 = ('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_hourly_smalldomain.nc') data = xr.open_dataset(path2, decode_times=False)#To view the date as 
integers of 0, 1, 2,.... -#data = xr.open_dataset(path2)# decode_times=False) #To view the date as integers of 0, 1, 2,.... -#data5 = xr.open_dataset(path2) # To view time in datetime format -var = list(data.variables)[3:] -data var = list(data.variables)[3:] -path = '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' +path = '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' data_nor, data_clustering = data_preprocessing('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc') @@ -238,102 +214,51 @@ def avg_inter_dist(norm_data, clustering_results): data_nor_nor = data_nor -DSC_cluster_result = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "rb")) +################################################################### -DSC_cluster_result1 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_0.34805576652439557.pkl", "rb")) +##DSC homogeneous clustering results +DSC_cluster_result = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "rb")) -DSC_cluster_result2 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_hom_ens_0.35190532062602214.pkl", "rb")) +DSC_cluster_result1 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_0.34805576652439557.pkl", "rb")) -DSC_cluster_result +DSC_cluster_result2 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_hom_ens_0.35190532062602214.pkl", "rb")) -silh = silhouette_score1(data_nor, DSC_cluster_result) -u,indices = np.unique(DSC_cluster_result,return_counts = True) # sc=0.3412 st 64 -# u,indices -print(silh) -print(u,indices) +################################################################### -data_nor_eval = data_nor +##DEC homogeneous clustering results +DEC_Cluster_results1 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_Ensemble1_034.pkl", "rb")) -result = DSC_cluster_result +DEC_Cluster_results2 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_0.30096328.pkl", "rb")) -from sklearn.metrics import davies_bouldin_score +DEC_Cluster_results3 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_0.30640745.pkl", "rb")) -db = davies_bouldin_score(data_nor, DSC_cluster_result) -print("Davies-Bouldin score is ", db) +DEC_Cluster_results4 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_03315.pkl", "rb")) -from sklearn.metrics import calinski_harabasz_score -ch = calinski_harabasz_score(data_nor, DSC_cluster_result) -print("Davies-Bouldin score is ", ch) - -print("RMSE score is ", total_rmse('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc', DSC_cluster_result)) +################################################################### -print("Variance is ", avg_var(data_nor, DSC_cluster_result)) +##DTC homogeneous clustering results +DTC_Cluster_results1 = 
pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_Ensemble1_034.pkl", "rb")) -print("Inter-cluster distance ", avg_inter_dist(data_nor, DSC_cluster_result)) +DTC_Cluster_results2 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3367.pkl", "rb")) -DEC_Cluster_results1 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_Ensemble1_034.pkl", "rb")) +DTC_Cluster_results3 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_hom_ens_034.pkl", "rb")) -DEC_Cluster_results2 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_0.30096328.pkl", "rb")) +DTC_Cluster_results4 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3226.pkl", "rb")) -DEC_Cluster_results3 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_0.30640745.pkl", "rb")) +DTC_Cluster_results5 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3124.pkl", "rb")) -DEC_Cluster_results4 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_03315.pkl", "rb")) +################################################################### -DEC_Cluster_results1 +##KMeans homogeneous clustering results +KMeans_Cluster_results1 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3389.pkl", "rb")) -silh = silhouette_score1(data_nor, DEC_Cluster_results1) -u,indices = np.unique(DEC_Cluster_results1,return_counts = True) # sc=0.3412 st 64 -# u,indices -print(silh) -print(u,indices) +KMeans_Cluster_results2 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.313.pkl", "rb")) -DTC_Cluster_results1 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_Ensemble1_034.pkl", "rb")) +KMeans_Cluster_results3 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3125.pkl", "rb")) -DTC_Cluster_results2 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3367.pkl", "rb")) - -DTC_Cluster_results3 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_hom_ens_034.pkl", "rb")) - -DTC_Cluster_results4 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3226.pkl", "rb")) - -DTC_Cluster_results5 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3124.pkl", "rb")) - -DTC_Cluster_results1 - -silh = silhouette_score1(data_nor, DTC_Cluster_results1) -u,indices = np.unique(DTC_Cluster_results1,return_counts = True) # sc=0.3412 st 64 -# u,indices -print(silh) -print(u,indices) - -KMeans_Cluster_results1 = 
pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3389.pkl", "rb")) - -KMeans_Cluster_results2 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.313.pkl", "rb")) - -KMeans_Cluster_results3 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3125.pkl", "rb")) - -KMeans_Cluster_results1 - -silh = silhouette_score1(data_nor, KMeans_Cluster_results1) -u,indices = np.unique(KMeans_Cluster_results1,return_counts = True) # sc=0.3412 st 64 -# u,indices -print(silh) -print(u,indices) - -from sklearn.metrics import davies_bouldin_score - -db = davies_bouldin_score(data_nor, KMeans_Cluster_results1) -print("Davies-Bouldin score is ", db) - -from sklearn.metrics import calinski_harabasz_score -ch = calinski_harabasz_score(data_nor, KMeans_Cluster_results1) -print("Davies-Bouldin score is ", ch) - -print("RMSE score is ", total_rmse('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc', KMeans_Cluster_results1)) - -print("Variance is ", avg_var(data_nor, KMeans_Cluster_results1)) - -print("Inter-cluster distance ", avg_inter_dist(data_nor, KMeans_Cluster_results1)) +################################################################### +##Main Heterogeneous Ensemble """# **HESC_performance**""" class ClusterSimilarityMatrix(): @@ -378,7 +303,7 @@ def to_binary_matrix(self, y_clusters): print(sim_matrixx) print(sim_matrixx.shape) print(unique_labels) -#np.save('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/fin_ens_co_occurrence_matrix.npy', sim_matrixx) +#np.save('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/fin_ens_co_occurrence_matrix.npy', sim_matrixx) #print(norm_sim_matrix) # To normalize a matrix @@ -434,11 +359,11 @@ def to_binary_matrix(self, y_clusters): from sklearn.metrics import pairwise_distances -dsc_ens = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_fin_ens_0.32257442534935293.pkl", "rb")) -dec_ens = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_fin_ens_0.3135124.pkl", "rb")) -km_ens = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/kmeans_fin_ens_0.32412578566580286.pkl", "rb")) +dsc_ens = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_fin_ens_0.32257442534935293.pkl", "rb")) +dec_ens = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_fin_ens_0.3135124.pkl", "rb")) +km_ens = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/kmeans_fin_ens_0.32412578566580286.pkl", "rb")) -#dtc_ens = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_fin_ens_0.3135124.pkl", "rb")) +#dtc_ens = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_fin_ens_0.3135124.pkl", "rb")) @@ -469,7 +394,7 @@ def 
to_binary_matrix(self, y_clusters): print(sim_matrixx) print(sim_matrixx.shape) print(unique_labels) -np.save('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/fin_ens_co_occurrence_matrix.npy', sim_matrixx) +np.save('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/fin_ens_co_occurrence_matrix.npy', sim_matrixx) #print(norm_sim_matrix) @@ -511,7 +436,7 @@ def to_binary_matrix(self, y_clusters): from sklearn.decomposition import NMF -sim_matrixx = np.load('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/fin_ens_co_occurrence_matrix.npy') +sim_matrixx = np.load('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/fin_ens_co_occurrence_matrix.npy') #sim_matrixx @@ -614,25 +539,6 @@ def best_clustering5(n): """# **Hybrid Bipartite Graph Formulation (HBGF)**""" -!pip install PyMetis - -!pip install kahypar - -import os -import warnings -from typing import Optional -import numpy as np -import pandas as pd -import xarray as xr -import pymetis -import kahypar -from scipy import sparse -from sklearn.metrics import pairwise_distances, normalized_mutual_info_score -from sklearn.utils.extmath import safe_sparse_dot - -from sklearn.metrics import silhouette_score, pairwise_distances, davies_bouldin_score -from sklearn.cluster import KMeans - def create_hypergraph(base_clusters): """Create the incidence matrix of base clusters' hypergraph @@ -722,7 +628,7 @@ def hbgf(base_clusters, nclass): print(silh) print(u,indices) -#path = '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' +#path = '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' pickle.dump(label_hbgf, open(path + 'HBGF_ens_' + str(silh) + '.pkl', "wb")) silh = silhouette_score1(data_nor, label_hbgf) @@ -793,7 +699,7 @@ def best_clustering(n): """# **HESC_hbgp**""" -hbgp = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/HBGF_enss_0.30932982840030593.pkl", "rb")) +hbgp = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/HBGF_enss_0.30932982840030593.pkl", "rb")) silh = silhouette_score1(data_nor_eval, hbgp) u,indices = np.unique(hbgp,return_counts = True) # sc=0.3412 st 64 @@ -815,9 +721,9 @@ def best_clustering(n): -co_occ = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/co-occ_ens_0.35889223651046626.pkl", "rb")) +co_occ = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/co-occ_ens_0.35889223651046626.pkl", "rb")) -nnmf = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/nmf_ens0.35258519039733466.pkl", "rb")) +nnmf = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/nmf_ens0.35258519039733466.pkl", "rb")) NUM_alg = 100 occurrence_threshold = 0.2 @@ -839,7 +745,7 @@ def best_clustering(n): print(final_matrix.shape) print(unique_labels) -#final_labels1 = 
pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/co-occ_ens_0.35936684974088645.pkl", "rb")) +#final_labels1 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/co-occ_ens_0.35936684974088645.pkl", "rb")) ch_index1s2 = calinski_harabasz_score(data_nor, nnmf) print(ch_index1s2) @@ -892,14 +798,14 @@ def best_clustering(n): -final_labels1 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/co-occ_ens_0.35936684974088645.pkl", "rb")) +final_labels1 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/co-occ_ens_0.35936684974088645.pkl", "rb")) ch_index12 = calinski_harabasz_score(data_nor, final_labels1) print(ch_index12) """# **DTC**""" -final_labels2 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_22.pkl", "rb")) +final_labels2 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_22.pkl", "rb")) ch_index2 = calinski_harabasz_score(data_nor, final_labels2) @@ -907,16 +813,16 @@ def best_clustering(n): """# **DSC**""" -final_labels3 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_hom_ens_0.35190532062602214.pkl", "rb")) -#/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_hom_ens_0.32763868041010935.pkl +final_labels3 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_hom_ens_0.35190532062602214.pkl", "rb")) +#/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_hom_ens_0.32763868041010935.pkl ch_index3 = calinski_harabasz_score(data_nor, final_labels3) print(ch_index3) """# **DEC**""" -final_labels4 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_03315.pkl", "rb")) +final_labels4 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_03315.pkl", "rb")) ch_index4 = calinski_harabasz_score(data_nor, final_labels4) -print(ch_index4) \ No newline at end of file +print(ch_index4) diff --git a/HESC/base_models/41_dac_multivariate_data_11_18_23.py b/HESC/base_models/41_dac_multivariate_data_11_18_23.py index ed7d12e..d1d6092 100644 --- a/HESC/base_models/41_dac_multivariate_data_11_18_23.py +++ b/HESC/base_models/41_dac_multivariate_data_11_18_23.py @@ -1,11 +1,3 @@ -# -*- coding: utf-8 -*- -"""41_DAC_Multivariate_Data_11-18-23.ipynb - -Automatically generated by Colaboratory. 
- -Original file is located at - https://colab.research.google.com/drive/1rWucbhqY1LHqAUw8avzhNP9hm7d_gaJ4 -""" @@ -126,7 +118,7 @@ def data_preprocessing(data_path): rdata_daily_np_array_T_R_nor_R = rdata_daily_np_array_T_R_nor.reshape((rdata_daily_np_array_T_R_nor.shape[0], rdata_daily_np_array.shape[2], rdata_daily_np_array.shape[3], rdata_daily_np_array.shape[0])) return rdata_daily_np_array_T_R_nor, rdata_daily_np_array_T_R_nor_R -path = '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' +path = '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' data_nor_eval, data_clustering = data_preprocessing('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc') @@ -316,7 +308,7 @@ def train(): model_name = 'DAC_model_final_' + str(round(time()))+ '.ckpt' #save_path = saver.save(sess, 'DAC_models/' + model_name) - save_path = saver.save(sess, '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DAC/res/' + model_name) + save_path = saver.save(sess, '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DAC/res/' + model_name) print("Model saved in file: %s" % save_path) print("Total epochs: %d" % epoch) @@ -579,7 +571,7 @@ def to_binary_matrix(self, y_clusters): print(sim_matrixx) print(sim_matrixx.shape) print(unique_labels) -#np.save('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/fin_ens_co_occurrence_matrix.npy', sim_matrixx) +#np.save('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/fin_ens_co_occurrence_matrix.npy', sim_matrixx) #print(norm_sim_matrix) # To normalize a matrix diff --git a/HESC/base_models/dec_clustering_model.py b/HESC/base_models/dec_clustering_model.py index d5bf6bc..cf7ccfe 100644 --- a/HESC/base_models/dec_clustering_model.py +++ b/HESC/base_models/dec_clustering_model.py @@ -1,11 +1,4 @@ -# -*- coding: utf-8 -*- -"""DEC-Clustering-Model.ipynb -Automatically generated by Colaboratory. - -Original file is located at - https://colab.research.google.com/drive/1cUcuKqCzQNeei3W-uAW5TDa_foLb14_f -""" diff --git a/HESC/base_models/dsc_model_2.py b/HESC/base_models/dsc_model_2.py index c97339a..6967352 100644 --- a/HESC/base_models/dsc_model_2.py +++ b/HESC/base_models/dsc_model_2.py @@ -1,11 +1,4 @@ -# -*- coding: utf-8 -*- -"""DSC_Model_2.ipynb -Automatically generated by Colaboratory. 
- -Original file is located at - https://colab.research.google.com/drive/1jtLQxGTtiRrhZ5gxwCR6dWYmppXZ8ARW -""" from google.colab import drive drive.mount('/content/drive') @@ -216,8 +209,8 @@ def compile(self, optimizer='sgd', loss='kld'): self.model.compile(optimizer=optimizer, loss=['mse', 'kld']) def fit(self, x, y=None, maxiter=2e4, batch_size=256, tol=1e-3, - update_interval=140, save_dir='/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DSC_2/saved'): - + update_interval=140, save_dir='/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DSC_2/saved'): + print('Update interval', update_interval) save_interval = 500 print('Save interval', save_interval) @@ -312,7 +305,7 @@ def data_preprocessing(data_path): rdata_daily_np_array_T_R_nor_R = rdata_daily_np_array_T_R_nor.reshape((rdata_daily_np_array_T_R_nor.shape[0], 1, rdata_daily_np_array.shape[2], rdata_daily_np_array.shape[3], rdata_daily_np_array.shape[0])) return rdata_daily_np_array_T_R_nor, rdata_daily_np_array_T_R_nor_R -path = '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' +path = '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' data_nor_eval, data_clustering = data_preprocessing('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc') @@ -331,7 +324,7 @@ def main(): maxiter = 2e4 # Maximum number of times the model traning will iterate update_interval = 50 # After each interval the clustering weights will be modified tol = 0.0000001 # If there is a cluster change more than this tollerance the model training will run - save_dir = '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DSC_2/saved' # The trained model will be stored here + save_dir = '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DSC_2/saved' # The trained model will be stored here # load dataset x = data_clustering # Input dataset of the transformed daily data @@ -631,14 +624,14 @@ def to_binary_matrix(self, y_clusters): unique_labels = np.unique(np.concatenate(clustering_models)) print(sim_matrixx) -np.save('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DSC_ens_co_occurrence_matrix.npy', sim_matrixx) +np.save('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DSC_ens_co_occurrence_matrix.npy', sim_matrixx) #print(norm_sim_matrix) import numpy as geek data_nor_eval = data_nor -#sim_matrixx = geek.load('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DEC_co_occurrence_matrix.npy') +#sim_matrixx = geek.load('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DEC_co_occurrence_matrix.npy') from sklearn.cluster import SpectralClustering spec_clt = SpectralClustering(n_clusters=7, affinity='precomputed', @@ -647,7 +640,7 @@ def to_binary_matrix(self, y_clusters): -# result = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "rb")) +# result = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "rb")) """# **Evaluation Metrics**""" diff --git a/HESC/base_models/dtc_clustering_model.py b/HESC/base_models/dtc_clustering_model.py index 
5d4f9b7..73409f6 100644 --- a/HESC/base_models/dtc_clustering_model.py +++ b/HESC/base_models/dtc_clustering_model.py @@ -1,11 +1,3 @@ -# -*- coding: utf-8 -*- -"""DTC-Clustering-Model.ipynb - -Automatically generated by Colaboratory. - -Original file is located at - https://colab.research.google.com/drive/1KsUhZSXy92wUviCSKDas0mTogDkkhYb3 -""" from google.colab import drive drive.mount('/content/drive') @@ -511,7 +503,7 @@ def pretrain(self, X, optimizer='adam', epochs=10, batch_size=64, - save_dir='/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DTC-Clustering-Model', + save_dir='/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DTC-Clustering-Model', verbose=1): """ Pre-train the autoencoder using only MSE reconstruction loss @@ -543,7 +535,7 @@ def fit(self, X_train, y_train=None, tol=0.001, patience=5, finetune_heatmap_at_epoch=8, - save_dir='/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DTC-Clustering-Model'): + save_dir='/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DTC-Clustering-Model'): """ Training procedure # Arguments @@ -748,7 +740,7 @@ def data_preprocessing(data_path): rdata_daily_np_array_T_R_nor = np.float32(rdata_daily_np_array_T_R_nor) # convert the data type to float32, otherwise the loass will be out-of-limit return rdata_daily_np_array_T_R_nor -path = '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' +path = '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' data_path = ('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc') nor_data = data_preprocessing(data_path) @@ -788,7 +780,7 @@ class Config(object): finetune_heatmap_at_epoch = 8 #, type=int, help='epoch where heatmap finetuning starts') initial_heatmap_loss_weight = 0.1 #, type=float, help='initial weight of heatmap loss vs clustering loss') final_heatmap_loss_weight = 0.9 #, type=float, help='final weight of heatmap loss vs clustering loss (heatmap finetuning)') - save_dir = '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DTC-Clustering-Model' + save_dir = '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DTC-Clustering-Model' @@ -1150,14 +1142,14 @@ def to_binary_matrix(self, y_clusters): unique_labels = np.unique(np.concatenate(clustering_models)) print(sim_matrixx) -np.save('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DSC_ens_co_occurrence_matrix.npy', sim_matrixx) +np.save('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DSC_ens_co_occurrence_matrix.npy', sim_matrixx) #print(norm_sim_matrix) import numpy as geek data_nor_eval = data_nor -#sim_matrixx = geek.load('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DEC_co_occurrence_matrix.npy') +#sim_matrixx = geek.load('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DEC_co_occurrence_matrix.npy') from sklearn.cluster import SpectralClustering spec_clt = SpectralClustering(n_clusters=7, affinity='precomputed', @@ -1166,7 +1158,7 @@ def to_binary_matrix(self, y_clusters): pickle.dump(final_labels, open(path + 'DSC_fin_ens_' + str(silh) + '.pkl', "wb")) -# result = 
pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "rb")) +# result = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "rb")) """# **Evaluation Metrics**""" @@ -1251,12 +1243,12 @@ def best_clustering(n): -# result = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_295.pkl", "rb")) -# result_1 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_295.pkl", "rb")) -# result_2 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3367.pkl", "rb")) -# result_3 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3124.pkl", "rb")) -# result_4 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3226.pkl", "rb")) -# result_5 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_02014.pkl", "rb")) +# result = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_295.pkl", "rb")) +# result_1 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_295.pkl", "rb")) +# result_2 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3367.pkl", "rb")) +# result_3 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3124.pkl", "rb")) +# result_4 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3226.pkl", "rb")) +# result_5 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_02014.pkl", "rb")) clusters_list = [ result_1, result_2, result_3, result_4,result_5] @@ -1297,14 +1289,14 @@ def to_binary_matrix(self, y_clusters): unique_labels = np.unique(np.concatenate(clustering_models)) print(sim_matrixx) -np.save('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DTC_ens_co_occurrence_matrix.npy', sim_matrixx) +np.save('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DTC_ens_co_occurrence_matrix.npy', sim_matrixx) #print(norm_sim_matrix) import numpy as geek data_nor_eval = data_nor -#sim_matrixx = geek.load('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DEC_co_occurrence_matrix.npy') +#sim_matrixx = geek.load('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DEC_co_occurrence_matrix.npy') from sklearn.cluster import SpectralClustering spec_clt = SpectralClustering(n_clusters=7, affinity='precomputed', @@ -1363,9 +1355,9 @@ def best_clustering5(n): -pickle.dump(final_labels, open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_hom_ens_034.pkl", "wb")) 
+pickle.dump(final_labels, open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_hom_ens_034.pkl", "wb")) -#cluster_result = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/test_clustering_results30.pkl", "rb")) +#cluster_result = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/test_clustering_results30.pkl", "rb")) @@ -1588,7 +1580,7 @@ def get_siz(self, x): df1['clusterid'] = clusters #df1["cluster"] = cluster.labels_ df1['clusterid'].value_counts() -df1.to_csv("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DEC_clustering.csv") +df1.to_csv("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DEC_clustering.csv") df1 df1.groupby('clusterid').count() @@ -1603,9 +1595,9 @@ def get_siz(self, x): import pickle -pickle.dump(clusters, open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_ensemble2_028.pkl", "wb")) +pickle.dump(clusters, open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_ensemble2_028.pkl", "wb")) -#cluster_result = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_ensemble2_028.pkl", "rb")) +#cluster_result = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_ensemble2_028.pkl", "rb")) """**Evaluations**""" diff --git a/HESC/base_models/kmeans_ensemble.py b/HESC/base_models/kmeans_ensemble.py index c6b8fd6..6bc2b0e 100644 --- a/HESC/base_models/kmeans_ensemble.py +++ b/HESC/base_models/kmeans_ensemble.py @@ -1,10 +1,6 @@ # -*- coding: utf-8 -*- """KMeans_ensemble.ipynb -Automatically generated by Colaboratory. 
- -Original file is located at - https://colab.research.google.com/drive/1q4Opc27RF8qYi_IUzcNL-BKXNLnfIipH [Source](https://www.kaggle.com/code/thedevastator/how-to-ensemble-clustering-algorithms-updated) """ @@ -117,7 +113,7 @@ def data_preprocessing(data_path): rdata_daily_np_array_T_R_nor_R = rdata_daily_np_array_T_R_nor.reshape((rdata_daily_np_array_T_R_nor.shape[0], rdata_daily_np_array.shape[2], rdata_daily_np_array.shape[3], rdata_daily_np_array.shape[0])) return rdata_daily_np_array_T_R_nor, rdata_daily_np_array_T_R_nor_R -path = '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' +path = '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' data_nor_eval, data_clustering = data_preprocessing('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc') @@ -317,7 +313,7 @@ def nor_get_clusters_and_centers(input,formed_clusters): print(silh) print(u,indices) -#pickle.dump(result_3, open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3125.pkl", "wb")) +#pickle.dump(result_3, open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3125.pkl", "wb")) silhouette, result_4 = best_clustering(20) @@ -327,7 +323,7 @@ def nor_get_clusters_and_centers(input,formed_clusters): print(silh) print(u,indices) -#pickle.dump(result_4, open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3389.pkl", "wb")) +#pickle.dump(result_4, open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3389.pkl", "wb")) silhouette, result_5 = best_clustering(20) @@ -378,14 +374,14 @@ def to_binary_matrix(self, y_clusters): unique_labels = np.unique(np.concatenate(clustering_models)) print(sim_matrixx) -np.save('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/KMeans_co_occurrence_matrix.npy', sim_matrixx) +np.save('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/KMeans_co_occurrence_matrix.npy', sim_matrixx) #print(norm_sim_matrix) import numpy as geek data_nor = data_nor_eval -#sim_matrixx = geek.load('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DEC_co_occurrence_matrix.npy') +#sim_matrixx = geek.load('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DEC_co_occurrence_matrix.npy') from sklearn.cluster import SpectralClustering spec_clt = SpectralClustering(n_clusters=7, affinity='precomputed', @@ -398,9 +394,9 @@ def to_binary_matrix(self, y_clusters): -#pickle.dump(final_labels, open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "wb")) +#pickle.dump(final_labels, open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "wb")) -result = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "rb")) +result = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "rb")) data_nor_eval = data_nor @@ -661,7 +657,7 @@ def to_binary_matrix(self, y_clusters): # 
print(km_sim_matrixx) -#np.save('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/KMeans_ens_co_occurrence_matrix.npy', sim_matrixx) +#np.save('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/KMeans_ens_co_occurrence_matrix.npy', sim_matrixx) #print(norm_sim_matrix) def best_clustering5(n): @@ -1008,7 +1004,7 @@ def get_siz(self, x): df1['clusterid'] = clusters #df1["cluster"] = cluster.labels_ df1['clusterid'].value_counts() -df1.to_csv("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/K-Means.csv") +df1.to_csv("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/K-Means.csv") df1 df1.groupby('clusterid').count() @@ -1023,9 +1019,9 @@ def get_siz(self, x): import pickle -#pickle.dump(clusters, open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_Ensemble1_029.pkl", "wb")) +#pickle.dump(clusters, open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_Ensemble1_029.pkl", "wb")) -#cluster_result = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_Ensemble1_029.pkl", "rb")) +#cluster_result = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_Ensemble1_029.pkl", "rb")) base_clustering = silhouette_score1(data_nor_eval, clusters) @@ -1080,4 +1076,4 @@ def get_siz(self, x): # /content/DAC_models/DAC_model_final_1700598391.ckpt.data-00000-of-00001 -# /content/DAC_models/DAC_model_final_1700598391.ckpt.index \ No newline at end of file +# /content/DAC_models/DAC_model_final_1700598391.ckpt.index diff --git a/HESC/base_models/model_selection.py b/HESC/base_models/model_selection.py index e0b8593..9292e1d 100644 --- a/HESC/base_models/model_selection.py +++ b/HESC/base_models/model_selection.py @@ -3,8 +3,7 @@ Automatically generated by Colaboratory. -Original file is located at - https://colab.research.google.com/drive/1-eUuh7ZpQUHuUOmero5lyMZ8s2b4uYuA + # **Model Selection** @@ -991,4 +990,4 @@ def best_clustering(n): print("Variance is ", avg_var(data_nor, result_gmm)) -print("Inter-cluster distance ", avg_inter_dist(data_nor, result_gmm)) \ No newline at end of file +print("Inter-cluster distance ", avg_inter_dist(data_nor, result_gmm)) diff --git a/HESC/baseline_models/cluster_ensembles.py b/HESC/baseline_models/cluster_ensembles.py index 4fd5751..bc24b54 100644 --- a/HESC/baseline_models/cluster_ensembles.py +++ b/HESC/baseline_models/cluster_ensembles.py @@ -1,11 +1,4 @@ -# -*- coding: utf-8 -*- -"""Cluster_Ensembles.ipynb -Automatically generated by Colaboratory. - -Original file is located at - https://colab.research.google.com/drive/1rAsFJZG3p6lrII47rE2w243oFKizap_c -""" @@ -18,7 +11,7 @@ !pip install "dask[dataframe]" !pip install netCDF4 -!git clone https://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@github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git +!git clone https://""@github.com/ # Commented out IPython magic to ensure Python compatibility. 
# %cd multivariate-weather-data-clustering diff --git a/HESC/baseline_models/ensemble_sc_(1).py b/HESC/baseline_models/ensemble_sc.py similarity index 98% rename from HESC/baseline_models/ensemble_sc_(1).py rename to HESC/baseline_models/ensemble_sc.py index b2335c2..a17be15 100644 --- a/HESC/baseline_models/ensemble_sc_(1).py +++ b/HESC/baseline_models/ensemble_sc.py @@ -1,10 +1,6 @@ # -*- coding: utf-8 -*- -"""Ensemble_SC (1).ipynb +"""Ensemble_SC.ipynb -Automatically generated by Colaboratory. - -Original file is located at - https://colab.research.google.com/drive/1IUxgFKCFBNwc-EN7mivbl9gGyXnyaTGE Paper: Ensemble Learning for Spectral Clustering Implementation of the paper “Li, H., Ye, X., Imakura, A. and Sakurai, T., 2020, November. Ensemble learning for spectral clustering. In 2020 IEEE International Conference on Data Mining (ICDM) (pp. 1094-1099). IEEE” In Python. @@ -22,7 +18,7 @@ !pip install "dask[dataframe]" !pip install netCDF4 -!git clone https://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@github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git +!git clone https:// # Commented out IPython magic to ensure Python compatibility. # %cd multivariate-weather-data-clustering diff --git a/HESC/baseline_models/parea_multi_view_ensemble_clustering.py b/HESC/baseline_models/parea_multi_view_ensemble_clustering.py index 853b034..7592e67 100644 --- a/HESC/baseline_models/parea_multi_view_ensemble_clustering.py +++ b/HESC/baseline_models/parea_multi_view_ensemble_clustering.py @@ -1,10 +1,6 @@ # -*- coding: utf-8 -*- """Parea_multi_view_ensemble_clustering.ipynb -Automatically generated by Colaboratory. - -Original file is located at - https://colab.research.google.com/drive/1IrTNUG_vH6uIUgOTDzGhNWtmIwhIlSAr Paper: Parea: multi-view ensemble clustering for cancer subtype discovery @@ -25,7 +21,7 @@ !pip install "dask[dataframe]" !pip install netCDF4 -!git clone https://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@github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git +!git clone https://""@github.com/ # Commented out IPython magic to ensure Python compatibility. 
# %cd multivariate-weather-data-clustering @@ -92,66 +88,6 @@ from netCDF4 import date2num,num2date from math import sqrt -# from sklearn import preprocessing -# from sklearn.preprocessing import MinMaxScaler - -# def data_preprocessing(data_path): -# rdata_daily = xr.open_dataset(data_path) # data_path = '/content/drive/MyDrive/ERA5_Dataset.nc' -# rdata_daily_np_array = np.array(rdata_daily.to_array()) # the shape of the dailt data is (7, 365, 41, 41) -# rdata_daily_np_array_latitude = np.concatenate((rdata_daily_np_array, np.zeros((7, 365, 41,7), dtype=int)), axis=3) -# rdata_daily_np_array_longitude = np.concatenate((rdata_daily_np_array_latitude, np.zeros((7, 365, 7, 48), dtype=int)), axis=2) -# rdata_daily_np_array = rdata_daily_np_array_longitude -# rdata_daily_np_array_T = rdata_daily_np_array.transpose(1,0,2,3) # transform the dailt data from (7, 365, 41, 41) to (365, 7, 41, 41) -# overall_mean = np.nanmean(rdata_daily_np_array_T[:, :, :, :]) -# for i in range(rdata_daily_np_array_T.shape[0]): -# for j in range(rdata_daily_np_array_T.shape[1]): -# for k in range(rdata_daily_np_array_T.shape[2]): -# for l in range(rdata_daily_np_array_T.shape[3]): -# if np.isnan(rdata_daily_np_array_T[i, j, k, l]): -# #print("NAN data in ", i, j, k, l) -# rdata_daily_np_array_T[i, j, k, l] = overall_mean -# rdata_daily_np_array_T = rdata_daily_np_array_T.transpose(0,2,3,1) -# rdata_daily_np_array_T_R = rdata_daily_np_array_T.reshape((rdata_daily_np_array_T.shape[0], -1)) # transform the dailt data from (365, 7, 41, 41) to (365, 11767) -# min_max_scaler = preprocessing.MinMaxScaler() # calling the function -# rdata_daily_np_array_T_R_nor = min_max_scaler.fit_transform(rdata_daily_np_array_T_R) # now normalize the data, otherwise the loss will be very big -# #rdata_daily_np_array_T_R_nor = np.float32(rdata_daily_np_array_T_R_nor) # convert the data type to float32, otherwise the loass will be out-of-limit -# rdata_daily_np_array_T_R_nor_R = rdata_daily_np_array_T_R_nor.reshape((rdata_daily_np_array_T_R_nor.shape[0], 1, rdata_daily_np_array.shape[2], rdata_daily_np_array.shape[3], rdata_daily_np_array.shape[0])) -# # return rdata_daily_np_array_T_R_nor, rdata_daily_np_array_T_R_nor_R - -# from sklearn import preprocessing -# from sklearn.preprocessing import MinMaxScaler - -# def datatransformation(data_path, variables): -# ''' The parameters accepted by this function are as follows: -# 1. "data_path" is the path of the netCDF4 dataset file. (data_path = '/content/drive/MyDrive/ERA5_meteo_sfc_2021_daily.nc') -# 2. "variables" is an array of the variable names of the netCDF4 dataset those we want to read. (variables = ['sst', 'sp']) -# If the "variables" array is empty the function will read the whole dataset. - -# Return value: -# The function will return the normalized values of the selected variables as a 2D NumPy array of size (365 x ___) and a 4D array as (365, 41, 41, ___). 
-# ''' - -# rdata_daily = xr.open_dataset(data_path) # data_path = '/content/drive/MyDrive/ERA5_Dataset.nc' -# if(len(variables)==0): -# rdata_daily_np_array = np.array(rdata_daily.to_array()) # the shape of the dailt data is (7, 365, 41, 41) -# else: -# rdata_daily_np_array = np.array(rdata_daily[variables].to_array()) -# rdata_daily_np_array_R = rdata_daily_np_array.reshape((rdata_daily_np_array.shape[0], -1)) #(7, 613565) -# for i in range (rdata_daily_np_array_R.shape[0]): -# tmp = rdata_daily_np_array_R[i] -# tmp[np.isnan(tmp)]=np.nanmean(tmp) -# rdata_daily_np_array_R[i] = tmp -# min_max_scaler = MinMaxScaler() # calling the function -# rdata_daily_np_array_nor = min_max_scaler.fit_transform(rdata_daily_np_array_R.T).T -# rdata_daily_np_array_nor_4D = rdata_daily_np_array_nor.reshape(rdata_daily_np_array.shape) # (7, 613565) to (7, 365, 41, 41) -# rdata_daily_np_array_nor_4D_T = rdata_daily_np_array_nor_4D.transpose(1,2,3,0) # (7, 365, 41, 41) to (365, 41, 41, 7) -# rdata_daily_np_array_nor_4D_T_R = rdata_daily_np_array_nor_4D_T.reshape((rdata_daily_np_array_nor_4D_T.shape[0], -1)) #(365, 11767) -# data_2d = rdata_daily_np_array_nor_4D_T_R -# data_4d = rdata_daily_np_array_nor_4D_T -# return data_2d, data_4d - - - from sklearn.metrics import silhouette_samples, silhouette_score def silhouette_score1(X, labels, *, metric="cosine", sample_size=None, random_state=None, **kwds): return np.mean(silhouette_samples(X, labels, metric="cosine", **kwds)) @@ -230,14 +166,9 @@ def parea(data): """# **Evaluation Metrics** -**Silhouette Score** +**Davies bouldin** """ -def silhouette_score1(X, labels, *, metric="cosine", sample_size=None, random_state=None, **kwds): - return np.mean(silhouette_samples(X, labels, metric=metric, **kwds)) - -"""**Davies bouldin**""" - def davies_bouldin_score(X, labels): return print("Davies-Bouldin score is ", davies_bouldin_score(X, labels)) @@ -424,4 +355,4 @@ def best_clustering(n): from sklearn.metrics import calinski_harabasz_score ch = calinski_harabasz_score(data_nor, result_parea) -print("Davies-Bouldin score is ", ch) \ No newline at end of file +print("Davies-Bouldin score is ", ch) diff --git a/README.md b/README.md index 0eccf16..c61f9bf 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ There are three ways to Download and Manage the MWDC package: 2 - Use command line: ```bash - git clone https://github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git + ``` \*Because the repository is private the command line method is not Recomended. @@ -19,7 +19,7 @@ There are three ways to Download and Manage the MWDC package: 4 - On Google Colab use the command below. ```bash -!git clone https://{clasic_access_token}@github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git + ``` \*\* This is how to generat [clasic_access_token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token#creating-a-personal-access-token-classic). 
diff --git a/archive/feature_extraction_pca_clustering.py b/archive/feature_extraction_pca_clustering.py index f7a1521..5933b93 100644 --- a/archive/feature_extraction_pca_clustering.py +++ b/archive/feature_extraction_pca_clustering.py @@ -48,7 +48,7 @@ def image_feature(path, image_size): if __name__ == '__main__': var_name = "u10" img_path = "/Volumes/GoogleDrive/.shortcut-targets-by-id/1vfQuEpjPQbXwHTxqAw34ALMoA45PJ7KQ/ECRP_Data_Science/Zheng/new_data_images/" + var_name - work_dir = "/Users/jianwu/Data/ECRP_ERA5/version-2/csv" + work_dir = "/Users//////Data/ECRP_ERA5/version-2/csv" # the size of the new figure is 41x41 img_features, img_names = image_feature(img_path, (41, 41)) @@ -87,4 +87,4 @@ def image_feature(path, image_size): pca_clustering_path = work_dir + "/" + var_name + "_pca_clusters.csv" image_cluster.to_csv(pca_clustering_path, index=False) - print("done!") \ No newline at end of file + print("done!") diff --git a/archive/image_generation.py b/archive/image_generation.py index d4fcbee..1827709 100644 --- a/archive/image_generation.py +++ b/archive/image_generation.py @@ -77,9 +77,9 @@ def image_saving(data_variable, saving_path): if __name__ == '__main__': - data = xr.open_dataset("/Users/jianwu/Data/ECRP_ERA5/ERA5_sample_hourly_20200201-20200331.nc") + data = xr.open_dataset("/Users/////Data/ECRP_ERA5/ERA5_sample_hourly_20200201-20200331.nc") print(data.data_vars) - image_saving(data['v10'], "/Users/jianwu/Data/ECRP_ERA5/") + image_saving(data['v10'], "/Users////u/Data/ECRP_ERA5/") #for data_key in data.data_vars: # image_saving(data[data_key], "/Users/jianwu/Data/ECRP_ERA5/") diff --git a/mwdc/clustering/st_agglomerative.py b/mwdc/clustering/st_agglomerative.py index 7fc5560..b90c19b 100644 --- a/mwdc/clustering/st_agglomerative.py +++ b/mwdc/clustering/st_agglomerative.py @@ -4,7 +4,7 @@ Automatically generated by Colaboratory. Original file is located at - https://colab.research.google.com/drive/1rffzeREHHxYtKe1WDhVz8nvwRHArX_Ob + https://colab.research.google.com/drive/10GIlSmNz4WCLYnDaaP65_I4Uxswerz8h """ import pandas as pd @@ -50,14 +50,14 @@ def st_agglomerative(input_path, input, variables,n, K, affinity, linkage, p, tr transformation: Boolean that accepts only "True" or "False" Output: - + formed_clusters: 1-D array of cluster labels classifying each data point along the time dimension to a cluster label A dataframe showing each cluster label and the correcponding cluster size. 
A dendrogram showing the steps in clustering - + ''' data = xr.open_dataset(input_path, decode_times=False) @@ -73,11 +73,11 @@ def st_agglomerative(input_path, input, variables,n, K, affinity, linkage, p, tr norm_data = pca1(norm_data,n) else: - + if dim_reduction==False: print("") else: - + if transformation==False: #High dimension reduction @@ -86,7 +86,7 @@ def st_agglomerative(input_path, input, variables,n, K, affinity, linkage, p, tr def plot_dendrogram(model, **kwargs): # Create linkage matrix and then plot the dendrogram - + # create the counts of samples under each node counts = np.zeros(model.children_.shape[0]) n_samples = len(model.labels_) @@ -100,9 +100,9 @@ def plot_dendrogram(model, **kwargs): counts[i] = current_count linkage_matrix = np.column_stack([model.children_, model.distances_,counts]).astype(float) - + # Plot the corresponding dendrogram - + dendrogram(linkage_matrix, **kwargs) #List of algorithms @@ -111,7 +111,7 @@ def plot_dendrogram(model, **kwargs): ('Average Linkage', 'average'), ('Complete Linkage', 'complete'), ('Ward Linkage', 'ward')) - + #distance metrics affinity_metrics = ['cosine', 'euclidean', 'manhattan'] @@ -119,7 +119,7 @@ def plot_dendrogram(model, **kwargs): for alg_name, alg in clustering_algorithms: if alg == 'ward' and metric != 'euclidean': continue model = AgglomerativeClustering(n_clusters=K, affinity=metric, linkage=alg, compute_distances=True) - + #model.fit(data) y_model = model.fit(norm_data) labels = y_model.labels_ @@ -129,11 +129,11 @@ def plot_dendrogram(model, **kwargs): df1['Cluster'].value_counts() print(labels) print("") - + #var = list(data.variables)[3:] - rmse = st_rmse(input_path, var, labels, transformation=True) + rmse = st_rmse_df(input_path, var, labels, transformation=True) print("This is the RMSE evaluation results:") print("") display(rmse) @@ -155,7 +155,7 @@ def plot_dendrogram(model, **kwargs): calinski_harabasz = calinski_harabasz_score(df1, labels) # It is also known as the Variance Ratio Criterion print("") - print("For n_clusters =", K,"The average calinski harabasz score is :", calinski_harabasz) #Higher value of CH index means the clusters are dense and well separated, + print("For n_clusters =", K,"The average calinski harabasz score is :", calinski_harabasz) #Higher value of CH index means the clusters are dense and well separated, #although there is no “acceptable” cut-off value. print("") print("") @@ -164,7 +164,7 @@ def plot_dendrogram(model, **kwargs): # graph size plt.figure(1, figsize = (18 ,12)) - + # plot the top 7 levels of the dendrogram # No more than p levels of the dendrogram tree are displayed. A “level” includes all nodes with p merges from the last merge. plot_dendrogram(model, truncate_mode='level',p = 7, get_leaves=True, orientation='top', labels=None) diff --git a/setup.py b/setup.py index e0feabe..641b837 100644 --- a/setup.py +++ b/setup.py @@ -13,12 +13,12 @@ setup( name="mwdc", version="1.5.0", - author="Jianwu Wang, Francis Nji, Omar Faruque, Rohan Salvi, Mostafa Cham", + author=".......................", author_email="", - url="https://github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git", + url=".................................................", install_requires=list(install_requires), packages=find_packages( exclude=("example*", "archive*", "Benchmark*")), long_description=long_description, -) \ No newline at end of file +)