Commit 33d2e2cd authored by Stefan Hiemer's avatar Stefan Hiemer
Browse files

Added all code plus docu.

parent d253a189
Loading
Loading
Loading
Loading

fem/apply_model.py

0 → 100644
+1488 −0

File added.

Preview size limit exceeded, changes collapsed.

fem/extract_data.py

0 → 100644
+347 −0
Original line number Diff line number Diff line

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 10 16:29:41 2020

- replace the element list and start stop routine by np split


@author: shiemer
"""

import h5py
import numpy as np 
import os
import glob 
from functools import partial 
from itertools import product 

def soumya_style_postprocessing(path, h5file, dataset=None, pattern='*.csv'):
    """
    Convert raw FEM avalanche csv files into one HDF5 group per simulation.

    For every csv file in `path` matching `pattern` that is not yet present
    under `dataset`, two datasets are stored in the HDF5 file:
      <dataset>/<name>/times  : columns (time, time to failure), float
      <dataset>/<name>/system : columns (avalanche size, max and min of the
                                row/column damage sums), int

    path: str, where the FEM data lies
    h5file: str, name of hdf5 file, we prefer "soumya.h5".
        NOTE(review): opened with mode 'r+', so the file must already exist.
    dataset: str or None, HDF5 group name; if None the first existing group
        of the file is used. See the list at the bottom of this file.
    pattern: str, glob pattern to find files. Should be left unchanged.
    """
    home = os.getcwd()
    os.chdir(path)

    # rebinds the name 'h5file' from the file name to the open file handle
    with h5py.File(home+'/' + h5file, 'r+') as h5file:

        files = glob.glob(pattern)


        if not dataset:
            dataset = list(h5file.keys())[0]

        # filter out files, which already have been included
        # (group keys are the csv file names without their extension)
        def recover_filename(key):
            return key + ".csv"

        if dataset in h5file.keys():
            keys = list(h5file[dataset].keys())
            already_read = list(map(recover_filename,keys))
            files = list(set(files) - set(already_read))
        else:
            h5file.create_group(dataset)

        # progress counter, only used for printing
        i = 1

        for file in files:
            print(file)
            print(i,len(files))

            name = file.rsplit('.',1)[0]

            i = i + 1
            #drop first row as it is always zero; the first column is dropped too
            data = np.loadtxt(file, dtype=float, delimiter = ',', skiprows = 1)[1:,1:]

            # get time and delete entries with same values
            # (last column holds time increments; cumsum gives absolute time)
            time = data[:,-1]
            time = np.cumsum(time)

            # index end denotes the index, where an avalanche stops:
            # events sharing the same absolute time belong to one avalanche
            time,index_end = np.unique(time, return_counts=True)
            index_end = np.cumsum(index_end)

            # NOTE(review): [:-2] plus the insert below yields one fewer
            # start than end index, so the map() further down silently drops
            # the final avalanche and 'times' ends up one row longer than
            # 'system'. [:-1] may have been intended -- confirm against
            # downstream consumers before changing.
            index_start = index_end[:-2]
            index_start = np.insert(index_start,0,0)

            time_to_failure = time[-1] - time
            # second column of the loaded data: index of the failed element
            element_list = data[:,1]

            def event_history(start,end,element_list,size):
                """
                start: scalar integer
                end: scalar integer
                element_list: list containing indices of failed elements
                size: scalar integer, linear size of the (size x size) system

                Counts every entry in the renew history with an index between
                start and end and returns the avalanche size. It also generates
                a numpy array with the same shape as the simulation grid. This
                array contains the number of renewal events per element of the
                simulation grid
                """

                elements = element_list[start:end]

                avalanche_size = np.size(elements)
                damage_snapshot = np.zeros((int(size**2),))
                element_index, num_events = np.unique(elements, return_counts=True)
                damage_snapshot[element_index.astype(int)] = num_events

                return avalanche_size, damage_snapshot

            # hard-coded linear grid size of the FEM simulations
            system_size = 64
            get_stuff = partial(event_history,
                                element_list = element_list,
                                size = system_size)

            # one (avalanche_size, damage_snapshot) pair per avalanche
            event_information = np.array(list(map(get_stuff,
                                                  index_start,
                                                  index_end)))

            avalanche_size = event_information[:,0]
            damage_snapshots = np.stack(event_information[:,1])
            # cumulative damage up to and including each avalanche
            damage_snapshots = np.cumsum(damage_snapshots,axis=0)
            damage_snapshots = damage_snapshots.reshape((
                                                  np.shape(damage_snapshots)[0],
                                                  system_size,system_size))
            # concatenate the 64 row sums and 64 column sums per snapshot
            rowcol_sum = np.append(np.sum(damage_snapshots,axis=1),
                                   np.sum(damage_snapshots,axis=2),axis=-1)

            # extremal row/column damage (NOTE: shadows the builtins min/max)
            min = np.min(rowcol_sum, axis = -1)
            max = np.max(rowcol_sum, axis = -1)

            times = np.transpose(np.stack((time, time_to_failure)))
            system_information = np.transpose(np.stack((avalanche_size, max, min)))

            h5file.create_dataset(dataset + '/' + name + '/times',
                               data=times.astype(float),
                               compression="gzip",
                               chunks=True)
            h5file.create_dataset(dataset + '/' + name + '/system',
                               data=system_information.astype(int),
                               compression="gzip",
                               chunks=True)


    os.chdir(home)
    return

def row_damage_postproc(h5file="row-new.h5",
                        path=".",
                        dataset=None,
                        pattern='*.csv'):
    """
    Creates a hdf5 file used for calculating other features. See Michael_features.

    Like soumya_style_postprocessing, but instead of only the extremal
    row/column damage it stores the full 128 row+column damage sums:
      <dataset>/<name>/times  : columns (time, time to failure), float
      <dataset>/<name>/system : column 0 avalanche size, columns 1..64 the
                                row damage sums, columns 65..128 the column
                                damage sums, int

    path: str, where the fem data is located
    h5file: str, name of h5file to store in.
        NOTE(review): opened with mode 'r+', so the file must already exist.
    dataset: see collection at the bottom of this file
    pattern: see documentation of function above.
    """
    home = os.getcwd()
    os.chdir(path)

    # rebinds the name 'h5file' from the file name to the open file handle
    with h5py.File(home+'/' + h5file, 'r+') as h5file:

        files = glob.glob(pattern)


        if not dataset:
            dataset = list(h5file.keys())[0]

        # filter out files, which already have been included
        # (group keys are the csv file names without their extension)
        def recover_filename(key):
            return key + ".csv"

        if dataset in h5file.keys():
            keys = list(h5file[dataset].keys())
            already_read = list(map(recover_filename,keys))
            files = list(set(files) - set(already_read))
        else:
            h5file.create_group(dataset)

        # progress counter, only used for printing
        i = 1

        for file in files:
            print(file)
            print(i,len(files))

            name = file.rsplit('.',1)[0]

            i = i + 1
            #drop first row as it is always zero; the first column is dropped too
            data = np.loadtxt(file, dtype=float, delimiter = ',', skiprows = 1)[1:,1:]

            # get time and delete entries with same values
            # (last column holds time increments; cumsum gives absolute time)
            time = data[:,-1]
            time = np.cumsum(time)

            # index end denotes the index, where an avalanche stops:
            # events sharing the same absolute time belong to one avalanche
            time,index_end = np.unique(time, return_counts=True)
            index_end = np.cumsum(index_end)

            # NOTE(review): [:-2] plus the insert below yields one fewer
            # start than end index, so the map() further down silently drops
            # the final avalanche and 'times' ends up one row longer than
            # 'system'. [:-1] may have been intended -- confirm against
            # Michael_features before changing.
            index_start = index_end[:-2]
            index_start = np.insert(index_start,0,0)

            time_to_failure = time[-1] - time
            # second column of the loaded data: index of the failed element
            element_list = data[:,1]

            def event_history(start,end,element_list,size):
                """
                start: scalar integer
                end: scalar integer
                element_list: list containing indices of failed elements
                size: scalar integer, linear size of the (size x size) system

                Counts every entry in the renew history with an index between
                start and end and returns the avalanche size. It also generates
                a numpy array with the same shape as the simulation grid. This
                array contains the number of renewal events per element of the
                simulation grid
                """

                elements = element_list[start:end]

                avalanche_size = np.size(elements)
                damage_snapshot = np.zeros((int(size**2),))
                element_index, num_events = np.unique(elements, return_counts=True)
                damage_snapshot[element_index.astype(int)] = num_events

                return avalanche_size, damage_snapshot

            # hard-coded linear grid size of the FEM simulations
            system_size = 64
            get_stuff = partial(event_history,
                                element_list = element_list,
                                size = system_size)

            # one (avalanche_size, damage_snapshot) pair per avalanche
            event_information = np.array(list(map(get_stuff,
                                                  index_start,
                                                  index_end)))

            avalanche_size = event_information[:,0]
            damage_snapshots = np.stack(event_information[:,1])
            # calculate the cumulative damage snapshot
            damage_snapshots = np.cumsum(damage_snapshots,axis=0)
            damage_snapshots = damage_snapshots.reshape((
                                                  np.shape(damage_snapshots)[0],
                                                  system_size,system_size))
            # concatenate the 64 row sums and 64 column sums per snapshot;
            # this column layout is what Michael_features slices ([1:65], [65:])
            rowcol_sum = np.append(np.sum(damage_snapshots,axis=1),
                                   np.sum(damage_snapshots,axis=2),axis=-1)

            # sort according to damage

            times = np.column_stack((time,time_to_failure))
            system_information = np.column_stack((avalanche_size, rowcol_sum))

            h5file.create_dataset(dataset + '/' + name + '/times',
                               data=times.astype(float),
                               compression="gzip",
                               chunks=True)
            h5file.create_dataset(dataset + '/' + name + '/system',
                               data=system_information.astype(int),
                               compression="gzip",
                               chunks=True)


    os.chdir(home)
    return

def Michael_features(h5file, datasets,X=100, width = 8):
    """
    Derive the 'Michael' feature set from the row/column damage data.

    Reads the output of row_damage_postproc from the hard-coded file
    "row-new.h5" and writes, per dataset and simulation key:
      <dataset>/<key>/times  : columns (time, rate, time to failure), float
      <dataset>/<key>/system : columns (average avalanche size, cumulative
                               damage, shear zone damage), float

    h5file: str, name of h5file to write the features into; created first if
        it does not exist yet.
    datasets: iterable of str, group names present in "row-new.h5".
    X: int, rate parameter; the sliding-window length is
        nav = floor(<average events per run> / X).
    width: int, window length of the moving sum over adjacent row/column
        damage sums used for the shear band feature.
    """
    # create the output file if it does not exist yet
    if not os.path.isfile(h5file):
        with h5py.File(h5file,"w") as datafile:
            pass

    with h5py.File(h5file,"a") as datafile:

        for dataset in datasets:
            with h5py.File("row-new.h5","r") as source:

                # total number of events over all runs of this dataset
                events = 0
                for key in source[dataset].keys():

                    events += source[dataset+"/"+key+"/times"].shape[0]

                # average number of events per run -> window length nav
                N = events/len(source[dataset].keys())
                nav = np.floor(N/X).astype(int)
                for key in source[dataset].keys():

                    # calculate rate: the first nav+1 events use the elapsed
                    # time since t=0, afterwards a sliding window of nav events
                    rate = nav/source[dataset+"/"+key+"/times"][:nav+1,0]
                    dt = source[dataset+"/"+key+"/times"][nav+1:,0] -\
                         source[dataset+"/"+key+"/times"][:-nav-1,0]
                    rate = np.append(rate, nav/dt)

                    # average avalanche amplitude: expanding mean for the
                    # first nav-1 events, then a moving average of length nav
                    avg_avalanche = np.cumsum(source[dataset+"/"+key+"/system"][:nav-1,0])\
                                    /np.arange(1,nav)
                    avg_avalanche = np.append(avg_avalanche,
                                              np.convolve(source[dataset+"/"+key+"/system"][:,0],
                                                          np.ones(nav)/nav,
                                                          mode='valid'))

                    # cumulative avalanche sum
                    cum_damage = np.cumsum(source[dataset+"/"+key+"/system"][:,0])

                    # shear band features: moving sum over 'width' adjacent
                    # row/column damage sums
                    windowing = partial(np.convolve,
                                        v=np.ones(width),
                                        mode="valid")
                    # sum damage over all row sums
                    # (columns 1:65 of /system hold the 64 row sums)
                    row = np.apply_along_axis(windowing, axis=1,
                                arr=source[dataset+"/"+key+"/system"][:,1:65])
                    # sum damage over all column sums
                    # (columns 65: of /system hold the 64 column sums)
                    col = np.apply_along_axis(windowing, axis=1,
                                arr=source[dataset+"/"+key+"/system"][:,65:])

                    # strongest damage accumulation in any band
                    shear_zone = np.max(np.concatenate((row,col),axis=1),
                                        axis=1)

                    time_information = source[dataset+"/"+key+"/times"][:]
                    # insert the rate as second column: (time, rate, ttf)
                    time_information = np.insert(time_information,1,rate,axis=1)
                    system_information = np.column_stack((avg_avalanche,
                                                          cum_damage,
                                                          shear_zone))
                    datafile.create_dataset(dataset + '/' + key + '/times',
                                   data=time_information, dtype=float,
                                   compression="gzip",
                                   chunks=True)
                    datafile.create_dataset(dataset + '/' + key + '/system',
                                   data=system_information,dtype=float,
                                   compression="gzip",
                                   chunks=True)

    return

if __name__ == '__main__':

    # ---- load simulation data ----
    # The extraction is run twice: files already stored in the HDF5 file are
    # skipped on the second pass, so the repetition only retries anything
    # that failed on the first pass.
    for ind in [2, 1]:

        datasets = ["csv_k1", "csv_k2", "csv_k8", "csv_k16",
                    "csv_stress0.7", "csv_stress0.9"]
        for dataset in datasets:
            # bug fix: row_damage_postproc() has no 'test' parameter;
            # passing test=False raised a TypeError before any data
            # was processed.
            row_damage_postproc(h5file="row-new.h5", dataset=dataset,
                                pattern='*.csv')

    # feature extraction: scan the shear-zone width at fixed X=100 ...
    for w in [2, 4, 8, 32]:
        Michael_features(h5file="michael-X-100-width-"+str(w)+".h5",
                     datasets = ["csv_k1",
                                 "csv_stress0.7","csv_stress0.9"],
                     X=100, width = w)

    # ... and the rate parameter X at fixed width 4
    for x in [50, 100, 200]:
        Michael_features(h5file="michael-X-"+str(x)+"-width-4.h5",
                     datasets = ["csv_k1",
                                 "csv_stress0.7","csv_stress0.9"],
                     X=x, width = 4)

fem/fig-3.py

0 → 100644
+133 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
To Do
- Sort functions
- rewrite dynamic functions by using masked arrays
- clear definition of median in example samples prediction
- calculate average and median for stupid predictors in functions
@author: Stefan Hiemer
"""

import os
from itertools import product
from math import ceil

import numpy as np
import matplotlib.pyplot as plt

import h5py

from baseline_predictions import get_data

def error_plot(datasets=["csv_k1",
                         "csv_k2",
                         "csv_stress0.7",
                         "csv_k8",
                         "csv_k16",
                         "csv_stress0.9"],
               methods=["mean","median"],
               references=["static","dynamic"]):
    """
    Plot the binned, lifetime-normalized absolute baseline prediction error
    over normalized time: one subplot per dataset, one curve per
    (method, reference) combination. Saves "baseline-error-plot.pdf".

    datasets: list of str, HDF5 group names (also used for subplot titles).
    methods: list of str, baseline statistics stored in predictions.h5.
    references: list of str, baseline references stored in predictions.h5.
    """

    colors = plt.get_cmap("tab10")
    fig,axs = plt.subplots(int(ceil(len(datasets)/2)),2,
                           sharex=True,
                           figsize=(15,10))
    font=26
    n_bins=100
    # baseline predictions keyed by "<method>-<reference>"
    ybase = dict()

    home = os.getcwd()

    i = 0
    for dataset in datasets:
        print(dataset.split("_")[1])

        # load normalized time and baseline time-to-failure predictions
        with h5py.File("predictions.h5","r") as f:

            # time
            time=f[dataset+"/normalized-time"][:,0]

            for method,reference in product(methods,references):
                # baseline
                ybase['-'.join([method,reference])]=f[dataset+"/time-to-failure_"+'-'.join([method,reference])][:,0]

        xtrain, xtest, ytrain, ytest = get_data(dataset = dataset,
                                                h5file = "soumya-data.h5",
                                                train_size = 1000,
                                                max_size = 20000,
                                                features = "soumya",
                                                window = None)

        # NOTE(review): presumably get_data changes the working directory and
        # this restores it -- confirm against baseline_predictions.get_data
        os.chdir(home)

        # lifetimes for normalization: the first entry of each test series,
        # repeated so it lines up with the flattened ytest
        lifetimes = np.array([y[0] for y in ytest])
        length = np.array([y.shape[0] for y in ytest])
        lifetimes = np.repeat(lifetimes,length)
        ytest = np.hstack(ytest)

        # linear binning prep
        bins = np.linspace(0, 1 - 1/n_bins, n_bins).tolist()
        indices = np.digitize(time, bins)
        # bin centers
        bin_time = (np.linspace(0, 1 - 1/n_bins, n_bins) +\
                    np.linspace(0 + 1/n_bins, 1, n_bins))/2
        masks = [indices==int(l+1) for l in range(len(bins))]

        # subplot position of this dataset (two columns)
        row,col = int(np.floor(i/2)),i%2
        j = 0
        for method,reference in product(methods,references):

            # calculate error, normalized by the lifetime
            error = np.abs(ybase['-'.join([method,reference])] - ytest)/lifetimes

            # binning: mean error per time bin
            error = np.array([np.mean(error[mask]) for mask in masks])

            #
            axs[row,col].plot(bin_time,error,"-",color=colors(j),
                              label='-'.join([method,reference]))

            j += 1

        # subplot title from the dataset name, e.g. "csv_k2" -> "k 2, load 0.7"
        if dataset[4]=="k":
            title = "k "+dataset.split("_")[1][1:]+", load 0.7"
        else:
            title = "k 4, load "+dataset.split("_")[1][6:]

        axs[row,col].set_title(title,fontsize=font)

        # only label the outer axes (bottom row / left column)
        if row == int(ceil(len(datasets)/2))-1:
            axs[row,col].set_xlabel(r"$t/t_{f}$",
                      fontsize = font+6)
        if col == 0:
            axs[row,col].set_ylabel(r"$|\hat{t}-t_{a}|/t_{f}$",
                          fontsize = font+6)

        axs[row,col].set_xlim(0,1)
        #axs[row,col].set_ylim(bottom=0)

        axs[row,col].tick_params(axis='both',
                                 which='both',
                                 labelsize=font)

        i+=1

    # one shared legend above the whole figure
    h,l = axs[int(ceil(len(datasets)/2))-1,0].get_legend_handles_labels()
    axs[0,0].legend(h,l,fontsize = font,
                     loc='upper center',
                     bbox_to_anchor=(1.0, 1.6),
                     frameon=False,
                     ncol=4)

    plt.savefig("baseline-error-plot.pdf",
                format="pdf",
                bbox_inches="tight")

    plt.show()
    return

if __name__ == '__main__':

    # create the baseline error figure with all default datasets
    error_plot()

fem/fig-5.py

0 → 100644
+109 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
To Do
- Sort functions
- rewrite dynamic functions by using masked arrays
- clear definition of median in example samples prediction
- calculate average and median for stupid predictors in functions
@author: Stefan Hiemer
"""

import os

import numpy as np
import matplotlib.pyplot as plt

import h5py

from get_data import get_data

def score_plot(datasets=("csv_k1", "csv_stress0.7", "csv_stress0.9"),
               method="median", reference="static"):
    """
    Plot the skill score 1 - e_ML/e_BA of the two machine-learning
    predictors relative to the chosen baseline, binned over normalized
    time, and save it as "score_<method>-<reference>.pdf".

    datasets: iterable of str, HDF5 group names (default values unchanged,
        but now an immutable tuple instead of a mutable default list).
    method: str, baseline statistic, e.g. "mean" or "median".
    reference: str, baseline reference, e.g. "static" or "dynamic".
    """
    colors = plt.get_cmap("tab10")
    fig, ax = plt.subplots(1, 1, figsize=(15, 10))
    n_bins = 100
    font = 30
    # lowest score encountered so far; sets the lower y-limit
    # (renamed from 'min', which shadowed the builtin)
    min_score = 0

    home = os.getcwd()

    for i, dataset in enumerate(datasets):

        # load normalized time, baseline and the two ML predictions
        with h5py.File("predictions.h5", "r") as f:

            time = f[dataset+"/normalized-time"][:, 0]

            # baseline prediction
            ybase = f[dataset+"/time-to-failure_"+'-'.join([method, reference])][:, 0]

            # prediction from the "michael" feature set
            ypred1 = f[dataset+"/time-to-failure_michael-X-100-width-4"][:, 0]

            # prediction from the "soumya" feature set
            ypred2 = f[dataset+"/time-to-failure_soumya-data"][:, 0]

        xtrain, xtest, ytrain, ytest = get_data(dataset=dataset,
                                                h5file="soumya-data.h5",
                                                train_size=1000,
                                                max_size=20000,
                                                features="soumya",
                                                window=None)
        # NOTE(review): presumably get_data changes the working directory and
        # this restores it -- confirm against the get_data module
        os.chdir(home)
        ytest = np.hstack(ytest)

        # absolute errors; ytest is already flat here, so the second
        # np.hstack the original applied for the baseline was redundant
        error1 = np.abs(ypred1 - ytest)
        error2 = np.abs(ypred2 - ytest)
        base_error = np.abs(ybase - ytest)

        # linear binning over normalized time; average errors per bin
        bins = np.linspace(0, 1 - 1/n_bins, n_bins).tolist()
        indices = np.digitize(time, bins)
        # bin centers
        bin_time = (np.linspace(0, 1 - 1/n_bins, n_bins) +
                    np.linspace(0 + 1/n_bins, 1, n_bins))/2
        masks = [indices == int(l+1) for l in range(len(bins))]
        error1 = np.array([np.mean(error1[mask]) for mask in masks])
        error2 = np.array([np.mean(error2[mask]) for mask in masks])
        base_error = np.array([np.mean(base_error[mask]) for mask in masks])

        # legend label from the dataset name
        if dataset.split("_")[1] == "k1":
            label = r"k 1, load 0.7"
        elif dataset.split("_")[1] == "stress0.7":
            label = r"k 4, load 0.7"
        elif dataset.split("_")[1] == "stress0.9":
            label = r"k 4, load 0.9"
        else:
            # bug fix: an unknown dataset previously raised UnboundLocalError
            # on the first iteration or silently reused a stale label
            raise ValueError("unknown dataset: " + dataset)

        # skill scores of both predictors against the baseline
        score1 = 1 - error1/base_error
        score2 = 1 - error2/base_error
        ax.plot(bin_time, score1, "-", color=colors(i),
                label=label)
        ax.plot(bin_time, score2, "--", color=colors(i))

        min_score = min(min_score, score1.min(), score2.min())

    ax.set_xlabel(r"$ t/t_{f}$",
                  fontsize=font)
    ax.set_ylabel(r"$1 - e_{ML}/e_{BA}$",
                  fontsize=font)
    # zero line marks parity with the baseline
    ax.axhline(y=0, xmin=0, xmax=1, color='k', linestyle="--")
    ax.set_xlim(0, 1)
    ax.set_ylim(min_score-0.1, 1)
    ax.tick_params(axis='both', which='both', labelsize=font)
    ax.legend(loc="lower left", fontsize=font,
              frameon=False)

    plt.savefig("score_"+method+"-"+reference+".pdf", format="pdf")

    #plt.show()
    return

if __name__ == '__main__':

    # create the score figure against the static-median baseline
    score_plot(method="median",reference="static")

fem/fig-6.py

0 → 100644
+119 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
To Do
- Sort functions
- rewrite dynamic functions by using masked arrays
- clear definition of median in example samples prediction
- calculate average and median for stupid predictors in functions
@author: Stefan Hiemer
"""
import glob as glob

import numpy as np
import json
import matplotlib.pyplot as plt
from itertools import product

def compare_hyperparameters():
    """
    Scatter-plot the R^2 score of the Michael-feature predictor against its
    two hyperparameters and save the figure as "hyperparam-R2.pdf".

    Left panel: rate parameter (X in {50, 100, 200}) at fixed width 4.
    Right panel: shear zone width (in {2, 4, 8, 32}) at fixed X = 100.

    Reads the "l2" metric from the json files in metrics/ and normalizes it
    by the target variance stored in metrics/csv_<dataset>-var.json.
    """

    datasets = ["k1","stress0.7","stress0.9"]

    # colorblind-friendly palette; one fixed color per dataset so both
    # panels agree
    colors = ['#377eb8', '#ff7f00', '#4daf4a',
              '#f781bf', '#a65628', '#984ea3',
              '#999999', '#e41a1c', '#dede00']
    font = 30
    col = {dataset: colors[i] for i, dataset in enumerate(datasets)}

    def _r2(dataset, width, X):
        """Load R^2 = 1 - l2/var for one hyperparameter combination."""
        files = glob.glob("metrics/*X-"+str(X)+"-width-"+str(width)+".h5-csv_"+dataset+"-michael-metrics.json")
        print(files)
        file = files[0]
        with open(file,'r') as f:
            metrics = json.load(f)

        with open("metrics/csv_"+dataset+"-var.json",'r') as f:
            var = json.load(f)

        return 1-metrics["l2"]/var

    fig, ax = plt.subplots(1, 2, figsize=(15, 10),
                           sharey=True)

    # left panel: vary the rate parameter X at fixed width 4
    for factor in product(datasets,[4],[50,100,200]):
        print(factor[0],factor[1],factor[2])
        ax[0].scatter(factor[2],
                      _r2(factor[0], factor[1], factor[2]),
                      color=col[factor[0]],
                      s=200)

    # right panel: vary the shear zone width at fixed X = 100
    for factor in product(datasets,[2,4,8,32],[100]):
        print(factor[0],factor[1],factor[2])
        ax[1].scatter(factor[1],
                      _r2(factor[0], factor[1], factor[2]),
                      color=col[factor[0]],
                      s=200)

    ax[0].set_xlabel("rate parameter "+r"$N$",fontsize=font+6)
    ax[0].set_ylabel(r"$R^{2}$",fontsize=font+6)
    ax[1].set_xlabel("shear zone width",fontsize=font+6)

    ax[0].set_xticks([50,100,200])
    ax[1].set_xticks([2,4,8,32])
    # R^2 is bounded above by 1
    ax[0].set_ylim(top=1)
    ax[1].set_ylim(top=1)

    ax[0].tick_params(axis='both',which='both',
                       labelsize=font)
    ax[1].tick_params(axis='both',which='both',
                       labelsize=font)
    plt.subplots_adjust(wspace=0.1)

    # legend entries "<k> , <load>" per dataset
    legends = []
    for dataset in datasets:
        if dataset[0] == "k":
            legends.append(dataset[1:]+" , 0.7")
        else:
            legends.append("4 , "+dataset[6:])

    leg = ax[1].legend(legends,
                       loc="center left",
                       bbox_to_anchor=(1, 0.5),
                       fontsize=font,
                       frameon=False,
                       title="k , load",
                       title_fontsize=font,
                       borderpad=0)

    # recolor the auto-generated legend handles to match the dataset colors.
    # NOTE(review): Legend.legendHandles was renamed legend_handles in
    # matplotlib 3.7 -- confirm the installed matplotlib version
    for i in range(len(datasets)):
        leg.legendHandles[i].set_color(col[datasets[i]])
    leg._legend_box.align = "right"
    plt.savefig("hyperparam-R2.pdf",
                format='pdf',
                dpi=100,
                bbox_inches='tight')

    plt.show()

    return

if __name__ == '__main__':

    # create the hyperparameter comparison figure
    compare_hyperparameters()
Loading