Commit 33d2e2cd authored by Stefan Hiemer's avatar Stefan Hiemer
Browse files

Added all code plus docu.

parent d253a189
Loading
Loading
Loading
Loading

fem/apply_model.py

0 → 100644
+1488 −0

File added.

Preview size limit exceeded, changes collapsed.

fem/extract_data.py

0 → 100644
+347 −0
Original line number Diff line number Diff line

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 10 16:29:41 2020

- replace the element list and start stop routine by np split


@author: shiemer
"""

import h5py
import numpy as np 
import os
import glob 
from functools import partial 
from itertools import product 

def soumya_style_postprocessing(path, h5file, dataset=None, pattern='*.csv'):
    """
    Convert raw FEM avalanche csv files into one HDF5 group per simulation.

    For every csv file in `path` matching `pattern` that is not yet present
    under `dataset`, two datasets are stored in the HDF5 file:
      <dataset>/<name>/times  : columns (time, time to failure), float
      <dataset>/<name>/system : columns (avalanche size, max and min of the
                                row/column damage sums), int

    path: str, where the FEM data lies
    h5file: str, name of hdf5 file, we prefer "soumya.h5".
        NOTE(review): opened with mode 'r+', so the file must already exist.
    dataset: str or None, HDF5 group name; if None the first existing group
        of the file is used. See the list at the bottom of this file.
    pattern: str, glob pattern to find files. Should be left unchanged.
    """
    home = os.getcwd()
    os.chdir(path)

    # rebinds the name 'h5file' from the file name to the open file handle
    with h5py.File(home+'/' + h5file, 'r+') as h5file:

        files = glob.glob(pattern)


        if not dataset:
            dataset = list(h5file.keys())[0]

        # filter out files, which already have been included
        # (group keys are the csv file names without their extension)
        def recover_filename(key):
            return key + ".csv"

        if dataset in h5file.keys():
            keys = list(h5file[dataset].keys())
            already_read = list(map(recover_filename,keys))
            files = list(set(files) - set(already_read))
        else:
            h5file.create_group(dataset)

        # progress counter, only used for printing
        i = 1

        for file in files:
            print(file)
            print(i,len(files))

            name = file.rsplit('.',1)[0]

            i = i + 1
            #drop first row as it is always zero; the first column is dropped too
            data = np.loadtxt(file, dtype=float, delimiter = ',', skiprows = 1)[1:,1:]

            # get time and delete entries with same values
            # (last column holds time increments; cumsum gives absolute time)
            time = data[:,-1]
            time = np.cumsum(time)

            # index end denotes the index, where an avalanche stops:
            # events sharing the same absolute time belong to one avalanche
            time,index_end = np.unique(time, return_counts=True)
            index_end = np.cumsum(index_end)

            # NOTE(review): [:-2] plus the insert below yields one fewer
            # start than end index, so the map() further down silently drops
            # the final avalanche and 'times' ends up one row longer than
            # 'system'. [:-1] may have been intended -- confirm against
            # downstream consumers before changing.
            index_start = index_end[:-2]
            index_start = np.insert(index_start,0,0)

            time_to_failure = time[-1] - time
            # second column of the loaded data: index of the failed element
            element_list = data[:,1]

            def event_history(start,end,element_list,size):
                """
                start: scalar integer
                end: scalar integer
                element_list: list containing indices of failed elements
                size: scalar integer, linear size of the (size x size) system

                Counts every entry in the renew history with an index between
                start and end and returns the avalanche size. It also generates
                a numpy array with the same shape as the simulation grid. This
                array contains the number of renewal events per element of the
                simulation grid
                """

                elements = element_list[start:end]

                avalanche_size = np.size(elements)
                damage_snapshot = np.zeros((int(size**2),))
                element_index, num_events = np.unique(elements, return_counts=True)
                damage_snapshot[element_index.astype(int)] = num_events

                return avalanche_size, damage_snapshot

            # hard-coded linear grid size of the FEM simulations
            system_size = 64
            get_stuff = partial(event_history,
                                element_list = element_list,
                                size = system_size)

            # one (avalanche_size, damage_snapshot) pair per avalanche
            event_information = np.array(list(map(get_stuff,
                                                  index_start,
                                                  index_end)))

            avalanche_size = event_information[:,0]
            damage_snapshots = np.stack(event_information[:,1])
            # cumulative damage up to and including each avalanche
            damage_snapshots = np.cumsum(damage_snapshots,axis=0)
            damage_snapshots = damage_snapshots.reshape((
                                                  np.shape(damage_snapshots)[0],
                                                  system_size,system_size))
            # concatenate the 64 row sums and 64 column sums per snapshot
            rowcol_sum = np.append(np.sum(damage_snapshots,axis=1),
                                   np.sum(damage_snapshots,axis=2),axis=-1)

            # extremal row/column damage (NOTE: shadows the builtins min/max)
            min = np.min(rowcol_sum, axis = -1)
            max = np.max(rowcol_sum, axis = -1)

            times = np.transpose(np.stack((time, time_to_failure)))
            system_information = np.transpose(np.stack((avalanche_size, max, min)))

            h5file.create_dataset(dataset + '/' + name + '/times',
                               data=times.astype(float),
                               compression="gzip",
                               chunks=True)
            h5file.create_dataset(dataset + '/' + name + '/system',
                               data=system_information.astype(int),
                               compression="gzip",
                               chunks=True)


    os.chdir(home)
    return

def row_damage_postproc(h5file="row-new.h5",
                        path=".",
                        dataset=None,
                        pattern='*.csv'):
    """
    Creates a hdf5 file used for calculating other features. See Michael_features.

    Like soumya_style_postprocessing, but instead of only the extremal
    row/column damage it stores the full 128 row+column damage sums:
      <dataset>/<name>/times  : columns (time, time to failure), float
      <dataset>/<name>/system : column 0 avalanche size, columns 1..64 the
                                row damage sums, columns 65..128 the column
                                damage sums, int

    path: str, where the fem data is located
    h5file: str, name of h5file to store in.
        NOTE(review): opened with mode 'r+', so the file must already exist.
    dataset: see collection at the bottom of this file
    pattern: see documentation of function above.
    """
    home = os.getcwd()
    os.chdir(path)

    # rebinds the name 'h5file' from the file name to the open file handle
    with h5py.File(home+'/' + h5file, 'r+') as h5file:

        files = glob.glob(pattern)


        if not dataset:
            dataset = list(h5file.keys())[0]

        # filter out files, which already have been included
        # (group keys are the csv file names without their extension)
        def recover_filename(key):
            return key + ".csv"

        if dataset in h5file.keys():
            keys = list(h5file[dataset].keys())
            already_read = list(map(recover_filename,keys))
            files = list(set(files) - set(already_read))
        else:
            h5file.create_group(dataset)

        # progress counter, only used for printing
        i = 1

        for file in files:
            print(file)
            print(i,len(files))

            name = file.rsplit('.',1)[0]

            i = i + 1
            #drop first row as it is always zero; the first column is dropped too
            data = np.loadtxt(file, dtype=float, delimiter = ',', skiprows = 1)[1:,1:]

            # get time and delete entries with same values
            # (last column holds time increments; cumsum gives absolute time)
            time = data[:,-1]
            time = np.cumsum(time)

            # index end denotes the index, where an avalanche stops:
            # events sharing the same absolute time belong to one avalanche
            time,index_end = np.unique(time, return_counts=True)
            index_end = np.cumsum(index_end)

            # NOTE(review): [:-2] plus the insert below yields one fewer
            # start than end index, so the map() further down silently drops
            # the final avalanche and 'times' ends up one row longer than
            # 'system'. [:-1] may have been intended -- confirm against
            # Michael_features before changing.
            index_start = index_end[:-2]
            index_start = np.insert(index_start,0,0)

            time_to_failure = time[-1] - time
            # second column of the loaded data: index of the failed element
            element_list = data[:,1]

            def event_history(start,end,element_list,size):
                """
                start: scalar integer
                end: scalar integer
                element_list: list containing indices of failed elements
                size: scalar integer, linear size of the (size x size) system

                Counts every entry in the renew history with an index between
                start and end and returns the avalanche size. It also generates
                a numpy array with the same shape as the simulation grid. This
                array contains the number of renewal events per element of the
                simulation grid
                """

                elements = element_list[start:end]

                avalanche_size = np.size(elements)
                damage_snapshot = np.zeros((int(size**2),))
                element_index, num_events = np.unique(elements, return_counts=True)
                damage_snapshot[element_index.astype(int)] = num_events

                return avalanche_size, damage_snapshot

            # hard-coded linear grid size of the FEM simulations
            system_size = 64
            get_stuff = partial(event_history,
                                element_list = element_list,
                                size = system_size)

            # one (avalanche_size, damage_snapshot) pair per avalanche
            event_information = np.array(list(map(get_stuff,
                                                  index_start,
                                                  index_end)))

            avalanche_size = event_information[:,0]
            damage_snapshots = np.stack(event_information[:,1])
            # calculate the cumulative damage snapshot
            damage_snapshots = np.cumsum(damage_snapshots,axis=0)
            damage_snapshots = damage_snapshots.reshape((
                                                  np.shape(damage_snapshots)[0],
                                                  system_size,system_size))
            # concatenate the 64 row sums and 64 column sums per snapshot;
            # this column layout is what Michael_features slices ([1:65], [65:])
            rowcol_sum = np.append(np.sum(damage_snapshots,axis=1),
                                   np.sum(damage_snapshots,axis=2),axis=-1)

            # sort according to damage

            times = np.column_stack((time,time_to_failure))
            system_information = np.column_stack((avalanche_size, rowcol_sum))

            h5file.create_dataset(dataset + '/' + name + '/times',
                               data=times.astype(float),
                               compression="gzip",
                               chunks=True)
            h5file.create_dataset(dataset + '/' + name + '/system',
                               data=system_information.astype(int),
                               compression="gzip",
                               chunks=True)


    os.chdir(home)
    return

def Michael_features(h5file, datasets,X=100, width = 8):
    """
    Derive the 'Michael' feature set from the row/column damage data.

    Reads the output of row_damage_postproc from the hard-coded file
    "row-new.h5" and writes, per dataset and simulation key:
      <dataset>/<key>/times  : columns (time, rate, time to failure), float
      <dataset>/<key>/system : columns (average avalanche size, cumulative
                               damage, shear zone damage), float

    h5file: str, name of h5file to write the features into; created first if
        it does not exist yet.
    datasets: iterable of str, group names present in "row-new.h5".
    X: int, rate parameter; the sliding-window length is
        nav = floor(<average events per run> / X).
    width: int, window length of the moving sum over adjacent row/column
        damage sums used for the shear band feature.
    """
    # create the output file if it does not exist yet
    if not os.path.isfile(h5file):
        with h5py.File(h5file,"w") as datafile:
            pass

    with h5py.File(h5file,"a") as datafile:

        for dataset in datasets:
            with h5py.File("row-new.h5","r") as source:

                # total number of events over all runs of this dataset
                events = 0
                for key in source[dataset].keys():

                    events += source[dataset+"/"+key+"/times"].shape[0]

                # average number of events per run -> window length nav
                N = events/len(source[dataset].keys())
                nav = np.floor(N/X).astype(int)
                for key in source[dataset].keys():

                    # calculate rate: the first nav+1 events use the elapsed
                    # time since t=0, afterwards a sliding window of nav events
                    rate = nav/source[dataset+"/"+key+"/times"][:nav+1,0]
                    dt = source[dataset+"/"+key+"/times"][nav+1:,0] -\
                         source[dataset+"/"+key+"/times"][:-nav-1,0]
                    rate = np.append(rate, nav/dt)

                    # average avalanche amplitude: expanding mean for the
                    # first nav-1 events, then a moving average of length nav
                    avg_avalanche = np.cumsum(source[dataset+"/"+key+"/system"][:nav-1,0])\
                                    /np.arange(1,nav)
                    avg_avalanche = np.append(avg_avalanche,
                                              np.convolve(source[dataset+"/"+key+"/system"][:,0],
                                                          np.ones(nav)/nav,
                                                          mode='valid'))

                    # cumulative avalanche sum
                    cum_damage = np.cumsum(source[dataset+"/"+key+"/system"][:,0])

                    # shear band features: moving sum over 'width' adjacent
                    # row/column damage sums
                    windowing = partial(np.convolve,
                                        v=np.ones(width),
                                        mode="valid")
                    # sum damage over all row sums
                    # (columns 1:65 of /system hold the 64 row sums)
                    row = np.apply_along_axis(windowing, axis=1,
                                arr=source[dataset+"/"+key+"/system"][:,1:65])
                    # sum damage over all column sums
                    # (columns 65: of /system hold the 64 column sums)
                    col = np.apply_along_axis(windowing, axis=1,
                                arr=source[dataset+"/"+key+"/system"][:,65:])

                    # strongest damage accumulation in any band
                    shear_zone = np.max(np.concatenate((row,col),axis=1),
                                        axis=1)

                    time_information = source[dataset+"/"+key+"/times"][:]
                    # insert the rate as second column: (time, rate, ttf)
                    time_information = np.insert(time_information,1,rate,axis=1)
                    system_information = np.column_stack((avg_avalanche,
                                                          cum_damage,
                                                          shear_zone))
                    datafile.create_dataset(dataset + '/' + key + '/times',
                                   data=time_information, dtype=float,
                                   compression="gzip",
                                   chunks=True)
                    datafile.create_dataset(dataset + '/' + key + '/system',
                                   data=system_information,dtype=float,
                                   compression="gzip",
                                   chunks=True)

    return

if __name__ == '__main__':

    # ---- load simulation data ----
    # The extraction is run twice: files already stored in the HDF5 file are
    # skipped on the second pass, so the repetition only retries anything
    # that failed on the first pass.
    for ind in [2, 1]:

        datasets = ["csv_k1", "csv_k2", "csv_k8", "csv_k16",
                    "csv_stress0.7", "csv_stress0.9"]
        for dataset in datasets:
            # bug fix: row_damage_postproc() has no 'test' parameter;
            # passing test=False raised a TypeError before any data
            # was processed.
            row_damage_postproc(h5file="row-new.h5", dataset=dataset,
                                pattern='*.csv')

    # feature extraction: scan the shear-zone width at fixed X=100 ...
    for w in [2, 4, 8, 32]:
        Michael_features(h5file="michael-X-100-width-"+str(w)+".h5",
                     datasets = ["csv_k1",
                                 "csv_stress0.7","csv_stress0.9"],
                     X=100, width = w)

    # ... and the rate parameter X at fixed width 4
    for x in [50, 100, 200]:
        Michael_features(h5file="michael-X-"+str(x)+"-width-4.h5",
                     datasets = ["csv_k1",
                                 "csv_stress0.7","csv_stress0.9"],
                     X=x, width = 4)

fem/fig-3.py

0 → 100644
+133 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
To Do
- Sort functions
- rewrite dynamic functions by using masked arrays
- clear definition of median in example samples prediction
- calculate average and median for stupid predictors in functions
@author: Stefan Hiemer
"""

import os
from itertools import product
from math import ceil

import numpy as np
import matplotlib.pyplot as plt

import h5py

from baseline_predictions import get_data

def error_plot(datasets=["csv_k1",
                         "csv_k2",
                         "csv_stress0.7",
                         "csv_k8",
                         "csv_k16",
                         "csv_stress0.9"],
               methods=["mean","median"],
               references=["static","dynamic"]):
    """
    Plot the binned, lifetime-normalized absolute baseline prediction error
    over normalized time: one subplot per dataset, one curve per
    (method, reference) combination. Saves "baseline-error-plot.pdf".

    datasets: list of str, HDF5 group names (also used for subplot titles).
    methods: list of str, baseline statistics stored in predictions.h5.
    references: list of str, baseline references stored in predictions.h5.
    """

    colors = plt.get_cmap("tab10")
    fig,axs = plt.subplots(int(ceil(len(datasets)/2)),2,
                           sharex=True,
                           figsize=(15,10))
    font=26
    n_bins=100
    # baseline predictions keyed by "<method>-<reference>"
    ybase = dict()

    home = os.getcwd()

    i = 0
    for dataset in datasets:
        print(dataset.split("_")[1])

        # load normalized time and baseline time-to-failure predictions
        with h5py.File("predictions.h5","r") as f:

            # time
            time=f[dataset+"/normalized-time"][:,0]

            for method,reference in product(methods,references):
                # baseline
                ybase['-'.join([method,reference])]=f[dataset+"/time-to-failure_"+'-'.join([method,reference])][:,0]

        xtrain, xtest, ytrain, ytest = get_data(dataset = dataset,
                                                h5file = "soumya-data.h5",
                                                train_size = 1000,
                                                max_size = 20000,
                                                features = "soumya",
                                                window = None)

        # NOTE(review): presumably get_data changes the working directory and
        # this restores it -- confirm against baseline_predictions.get_data
        os.chdir(home)

        # lifetimes for normalization: the first entry of each test series,
        # repeated so it lines up with the flattened ytest
        lifetimes = np.array([y[0] for y in ytest])
        length = np.array([y.shape[0] for y in ytest])
        lifetimes = np.repeat(lifetimes,length)
        ytest = np.hstack(ytest)

        # linear binning prep
        bins = np.linspace(0, 1 - 1/n_bins, n_bins).tolist()
        indices = np.digitize(time, bins)
        # bin centers
        bin_time = (np.linspace(0, 1 - 1/n_bins, n_bins) +\
                    np.linspace(0 + 1/n_bins, 1, n_bins))/2
        masks = [indices==int(l+1) for l in range(len(bins))]

        # subplot position of this dataset (two columns)
        row,col = int(np.floor(i/2)),i%2
        j = 0
        for method,reference in product(methods,references):

            # calculate error, normalized by the lifetime
            error = np.abs(ybase['-'.join([method,reference])] - ytest)/lifetimes

            # binning: mean error per time bin
            error = np.array([np.mean(error[mask]) for mask in masks])

            #
            axs[row,col].plot(bin_time,error,"-",color=colors(j),
                              label='-'.join([method,reference]))

            j += 1

        # subplot title from the dataset name, e.g. "csv_k2" -> "k 2, load 0.7"
        if dataset[4]=="k":
            title = "k "+dataset.split("_")[1][1:]+", load 0.7"
        else:
            title = "k 4, load "+dataset.split("_")[1][6:]

        axs[row,col].set_title(title,fontsize=font)

        # only label the outer axes (bottom row / left column)
        if row == int(ceil(len(datasets)/2))-1:
            axs[row,col].set_xlabel(r"$t/t_{f}$",
                      fontsize = font+6)
        if col == 0:
            axs[row,col].set_ylabel(r"$|\hat{t}-t_{a}|/t_{f}$",
                          fontsize = font+6)

        axs[row,col].set_xlim(0,1)
        #axs[row,col].set_ylim(bottom=0)

        axs[row,col].tick_params(axis='both',
                                 which='both',
                                 labelsize=font)

        i+=1

    # one shared legend above the whole figure
    h,l = axs[int(ceil(len(datasets)/2))-1,0].get_legend_handles_labels()
    axs[0,0].legend(h,l,fontsize = font,
                     loc='upper center',
                     bbox_to_anchor=(1.0, 1.6),
                     frameon=False,
                     ncol=4)

    plt.savefig("baseline-error-plot.pdf",
                format="pdf",
                bbox_inches="tight")

    plt.show()
    return

if __name__ == '__main__':

    # create the baseline error figure with all default datasets
    error_plot()

fem/fig-5.py

0 → 100644
+109 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
To Do
- Sort functions
- rewrite dynamic functions by using masked arrays
- clear definition of median in example samples prediction
- calculate average and median for stupid predictors in functions
@author: Stefan Hiemer
"""

import os

import numpy as np
import matplotlib.pyplot as plt

import h5py

from get_data import get_data

def score_plot(datasets=("csv_k1", "csv_stress0.7", "csv_stress0.9"),
               method="median", reference="static"):
    """
    Plot the skill score 1 - e_ML/e_BA of the two machine-learning
    predictors relative to the chosen baseline, binned over normalized
    time, and save it as "score_<method>-<reference>.pdf".

    datasets: iterable of str, HDF5 group names (default values unchanged,
        but now an immutable tuple instead of a mutable default list).
    method: str, baseline statistic, e.g. "mean" or "median".
    reference: str, baseline reference, e.g. "static" or "dynamic".
    """
    colors = plt.get_cmap("tab10")
    fig, ax = plt.subplots(1, 1, figsize=(15, 10))
    n_bins = 100
    font = 30
    # lowest score encountered so far; sets the lower y-limit
    # (renamed from 'min', which shadowed the builtin)
    min_score = 0

    home = os.getcwd()

    for i, dataset in enumerate(datasets):

        # load normalized time, baseline and the two ML predictions
        with h5py.File("predictions.h5", "r") as f:

            time = f[dataset+"/normalized-time"][:, 0]

            # baseline prediction
            ybase = f[dataset+"/time-to-failure_"+'-'.join([method, reference])][:, 0]

            # prediction from the "michael" feature set
            ypred1 = f[dataset+"/time-to-failure_michael-X-100-width-4"][:, 0]

            # prediction from the "soumya" feature set
            ypred2 = f[dataset+"/time-to-failure_soumya-data"][:, 0]

        xtrain, xtest, ytrain, ytest = get_data(dataset=dataset,
                                                h5file="soumya-data.h5",
                                                train_size=1000,
                                                max_size=20000,
                                                features="soumya",
                                                window=None)
        # NOTE(review): presumably get_data changes the working directory and
        # this restores it -- confirm against the get_data module
        os.chdir(home)
        ytest = np.hstack(ytest)

        # absolute errors; ytest is already flat here, so the second
        # np.hstack the original applied for the baseline was redundant
        error1 = np.abs(ypred1 - ytest)
        error2 = np.abs(ypred2 - ytest)
        base_error = np.abs(ybase - ytest)

        # linear binning over normalized time; average errors per bin
        bins = np.linspace(0, 1 - 1/n_bins, n_bins).tolist()
        indices = np.digitize(time, bins)
        # bin centers
        bin_time = (np.linspace(0, 1 - 1/n_bins, n_bins) +
                    np.linspace(0 + 1/n_bins, 1, n_bins))/2
        masks = [indices == int(l+1) for l in range(len(bins))]
        error1 = np.array([np.mean(error1[mask]) for mask in masks])
        error2 = np.array([np.mean(error2[mask]) for mask in masks])
        base_error = np.array([np.mean(base_error[mask]) for mask in masks])

        # legend label from the dataset name
        if dataset.split("_")[1] == "k1":
            label = r"k 1, load 0.7"
        elif dataset.split("_")[1] == "stress0.7":
            label = r"k 4, load 0.7"
        elif dataset.split("_")[1] == "stress0.9":
            label = r"k 4, load 0.9"
        else:
            # bug fix: an unknown dataset previously raised UnboundLocalError
            # on the first iteration or silently reused a stale label
            raise ValueError("unknown dataset: " + dataset)

        # skill scores of both predictors against the baseline
        score1 = 1 - error1/base_error
        score2 = 1 - error2/base_error
        ax.plot(bin_time, score1, "-", color=colors(i),
                label=label)
        ax.plot(bin_time, score2, "--", color=colors(i))

        min_score = min(min_score, score1.min(), score2.min())

    ax.set_xlabel(r"$ t/t_{f}$",
                  fontsize=font)
    ax.set_ylabel(r"$1 - e_{ML}/e_{BA}$",
                  fontsize=font)
    # zero line marks parity with the baseline
    ax.axhline(y=0, xmin=0, xmax=1, color='k', linestyle="--")
    ax.set_xlim(0, 1)
    ax.set_ylim(min_score-0.1, 1)
    ax.tick_params(axis='both', which='both', labelsize=font)
    ax.legend(loc="lower left", fontsize=font,
              frameon=False)

    plt.savefig("score_"+method+"-"+reference+".pdf", format="pdf")

    #plt.show()
    return

if __name__ == '__main__':

    # create the score figure against the static-median baseline
    score_plot(method="median",reference="static")

fem/fig-6.py

0 → 100644
+119 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
To Do
- Sort functions
- rewrite dynamic functions by using masked arrays
- clear definition of median in example samples prediction
- calculate average and median for stupid predictors in functions
@author: Stefan Hiemer
"""
import glob as glob

import numpy as np
import json
import matplotlib.pyplot as plt
from itertools import product

def compare_hyperparameters():
    """
    Scatter-plot the R^2 score of the Michael-feature predictor against its
    two hyperparameters and save the figure as "hyperparam-R2.pdf".

    Left panel: rate parameter (X in {50, 100, 200}) at fixed width 4.
    Right panel: shear zone width (in {2, 4, 8, 32}) at fixed X = 100.

    Reads the "l2" metric from the json files in metrics/ and normalizes it
    by the target variance stored in metrics/csv_<dataset>-var.json.
    """

    datasets = ["k1","stress0.7","stress0.9"]

    # colorblind-friendly palette; one fixed color per dataset so both
    # panels agree
    colors = ['#377eb8', '#ff7f00', '#4daf4a',
              '#f781bf', '#a65628', '#984ea3',
              '#999999', '#e41a1c', '#dede00']
    font = 30
    col = {dataset: colors[i] for i, dataset in enumerate(datasets)}

    def _r2(dataset, width, X):
        """Load R^2 = 1 - l2/var for one hyperparameter combination."""
        files = glob.glob("metrics/*X-"+str(X)+"-width-"+str(width)+".h5-csv_"+dataset+"-michael-metrics.json")
        print(files)
        file = files[0]
        with open(file,'r') as f:
            metrics = json.load(f)

        with open("metrics/csv_"+dataset+"-var.json",'r') as f:
            var = json.load(f)

        return 1-metrics["l2"]/var

    fig, ax = plt.subplots(1, 2, figsize=(15, 10),
                           sharey=True)

    # left panel: vary the rate parameter X at fixed width 4
    for factor in product(datasets,[4],[50,100,200]):
        print(factor[0],factor[1],factor[2])
        ax[0].scatter(factor[2],
                      _r2(factor[0], factor[1], factor[2]),
                      color=col[factor[0]],
                      s=200)

    # right panel: vary the shear zone width at fixed X = 100
    for factor in product(datasets,[2,4,8,32],[100]):
        print(factor[0],factor[1],factor[2])
        ax[1].scatter(factor[1],
                      _r2(factor[0], factor[1], factor[2]),
                      color=col[factor[0]],
                      s=200)

    ax[0].set_xlabel("rate parameter "+r"$N$",fontsize=font+6)
    ax[0].set_ylabel(r"$R^{2}$",fontsize=font+6)
    ax[1].set_xlabel("shear zone width",fontsize=font+6)

    ax[0].set_xticks([50,100,200])
    ax[1].set_xticks([2,4,8,32])
    # R^2 is bounded above by 1
    ax[0].set_ylim(top=1)
    ax[1].set_ylim(top=1)

    ax[0].tick_params(axis='both',which='both',
                       labelsize=font)
    ax[1].tick_params(axis='both',which='both',
                       labelsize=font)
    plt.subplots_adjust(wspace=0.1)

    # legend entries "<k> , <load>" per dataset
    legends = []
    for dataset in datasets:
        if dataset[0] == "k":
            legends.append(dataset[1:]+" , 0.7")
        else:
            legends.append("4 , "+dataset[6:])

    leg = ax[1].legend(legends,
                       loc="center left",
                       bbox_to_anchor=(1, 0.5),
                       fontsize=font,
                       frameon=False,
                       title="k , load",
                       title_fontsize=font,
                       borderpad=0)

    # recolor the auto-generated legend handles to match the dataset colors.
    # NOTE(review): Legend.legendHandles was renamed legend_handles in
    # matplotlib 3.7 -- confirm the installed matplotlib version
    for i in range(len(datasets)):
        leg.legendHandles[i].set_color(col[datasets[i]])
    leg._legend_box.align = "right"
    plt.savefig("hyperparam-R2.pdf",
                format='pdf',
                dpi=100,
                bbox_inches='tight')

    plt.show()

    return

if __name__ == '__main__':

    # create the hyperparameter comparison figure
    compare_hyperparameters()
Loading