import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, Ridge, LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
import matplotlib.pyplot as plt
import altair as alt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from keras.utils import to_categorical
import warnings
warnings.filterwarnings('ignore')
def plot_loss(history):
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.ylim([0, 10])
    plt.xlabel('Epoch')
    plt.ylabel('Error [IFT]')
    plt.legend()
    plt.grid(True)
def plot_ift(x, y):
    # uses the global train_features / train_labels for the scatter background
    plt.scatter(train_features['Water_content'], train_labels, label='Data')
    plt.plot(x, y, color='k', label='Predictions')
    plt.xlabel('Water_content')
    plt.ylabel('IFT')
    plt.legend()
- Problem Statement
- Methodology
- Results
- Conclusions
Lab experiments cost time and materials. We often run many experiments to investigate how one or more variables, e.g. the type of gas or the water content, affect target variables such as IFT or volume ratio. However, the number of experiments needed to reveal this relationship is hard to estimate in advance.
If we can model the properties we want to measure using data from past experiments, we could reduce the number of experiments required. The variables in this dataset are:
- Type of Gas [CO2, CH4]
- Water Content in the emulsion [0-1]
- Time elapsed since starting the experiments
- Emulsion Viscosity
# reading the data
ift_data = pd.read_excel('data/ift_data.xlsx')
# quick glimpse at the number of rows
print('Number of records in the dataset is ',len(ift_data))
# let's explore the data
ift_data.head()
Number of records in the dataset is 561
| | Gas | Water_content | viscosity | time_minutes | volume_ratio | IFT |
---|---|---|---|---|---|---|
0 | CH4 | 0.0 | 27000.0 | 0.000050 | 1.000000 | 25.08 |
1 | CH4 | 0.0 | 27000.0 | 14.998333 | 1.002618 | 25.12 |
2 | CH4 | 0.0 | 27000.0 | 30.000000 | 1.005236 | 25.16 |
3 | CH4 | 0.0 | 27000.0 | 45.000000 | 1.006108 | 25.17 |
4 | CH4 | 0.0 | 27000.0 | 60.000000 | 1.007853 | 25.21 |
# how IFT changes with time
alt.Chart(ift_data, title = 'Change in IFT over time for CH4 and CO2').mark_circle(size=60).encode(
alt.X('time_minutes:Q', title = 'Time'),
alt.Y('IFT:Q'),
alt.Color('Gas:N'),
).interactive()
# how IFT changes with water_content
alt.Chart(ift_data, title = 'Change in IFT with water content for CH4 and CO2').mark_circle(size=60).encode(
alt.X('Water_content:Q', title = 'Water Content'),
alt.Y('IFT:Q'),
alt.Color('Gas:N'),
).interactive()
x = ift_data.iloc[:, :4]  # features: Gas, Water_content, viscosity, time_minutes
x
| | Gas | Water_content | viscosity | time_minutes |
---|---|---|---|---|
0 | CH4 | 0.0 | 27000.0000 | 0.000050 |
1 | CH4 | 0.0 | 27000.0000 | 14.998333 |
2 | CH4 | 0.0 | 27000.0000 | 30.000000 |
3 | CH4 | 0.0 | 27000.0000 | 45.000000 |
4 | CH4 | 0.0 | 27000.0000 | 60.000000 |
... | ... | ... | ... | ... |
556 | CO2 | 0.7 | 236837.0833 | 150.000000 |
557 | CO2 | 0.7 | 236837.0833 | 165.000000 |
558 | CO2 | 0.7 | 236837.0833 | 180.000000 |
559 | CO2 | 0.7 | 236837.0833 | 195.000000 |
560 | CO2 | 0.7 | 236837.0833 | 210.000000 |
561 rows × 4 columns
y = ift_data.iloc[:, 5]  # target: IFT
y
0 25.08 1 25.12 2 25.16 3 25.17 4 25.21 ... 556 19.93 557 19.95 558 19.87 559 19.92 560 19.86 Name: IFT, Length: 561, dtype: float64
# splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123, shuffle=True)
# since we have numeric and categorical features we will create a column transformer to transform them separately
X_train
| | Gas | Water_content | viscosity | time_minutes |
---|---|---|---|---|
204 | CH4 | 0.33 | 61030.55556 | 630.0 |
558 | CO2 | 0.70 | 236837.08330 | 180.0 |
514 | CO2 | 0.10 | 35930.00000 | 915.0 |
252 | CH4 | 0.45 | 74375.55556 | 555.0 |
164 | CH4 | 0.20 | 45845.00000 | 660.0 |
... | ... | ... | ... | ... |
98 | CH4 | 0.10 | 35930.00000 | 615.0 |
322 | CH4 | 0.50 | 86649.44444 | 780.0 |
382 | CH4 | 0.70 | 236837.08330 | 735.0 |
365 | CH4 | 0.70 | 236837.08330 | 480.0 |
510 | CO2 | 0.10 | 35930.00000 | 855.0 |
448 rows × 4 columns
def run_linear_model(X_train, y_train, viscosity='not_included'):
    if viscosity == 'not_included':
        numeric_features = ['Water_content', 'time_minutes']
    else:
        numeric_features = ['Water_content', 'time_minutes', 'viscosity']
    # first, a transformer for the numeric features
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    # now a transformer for the categorical features
    categorical_features = ['Gas']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    # creating a preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])
    ridge_model = Ridge()
    # include the preprocessor and the model in one pipeline.
    # Now we have a full prediction pipeline.
    reg_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('Regressor', ridge_model)])
    # finally, pass the pipeline to GridSearchCV to find the optimum parameters for the model
    param_grid = {
        'Regressor__alpha': [0.1, 0.25, 0.4],
    }
    search = GridSearchCV(reg_pipeline, param_grid, cv=5)
    # fitting the model
    search.fit(X_train, y_train)
    # printing the best parameters and the test-set score (X_test, y_test are globals here)
    print(search.best_params_)
    print("model score: %.3f" % search.score(X_test, y_test))
    return search
lm1 = run_linear_model(X_train,y_train,viscosity = 'not_included')
{'Regressor__alpha': 0.4} model score: 0.515
lm2 = run_linear_model(X_train,y_train,viscosity = 'not_included')
{'Regressor__alpha': 0.4} model score: 0.515
# let's look at the model parameters
model_intercept = lm2.best_estimator_['Regressor'].intercept_
model_intercept
22.641914511039513
# The interpretability of the linear model
coeff_parameter = pd.DataFrame(lm2.best_estimator_['Regressor'].coef_, columns=['Coefficient'])
coeff_parameter['feature'] = np.array(['Water_content', 'time_minutes', 'Gas_CH4', 'Gas_CO2'])
coeff_parameter
lm2.best_estimator_['Regressor'].coef_
array([-0.28426813, -0.15528172, 1.68815898, -1.68815898])
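To avoid hard-coding the coefficient labels, the transformed feature names can be pulled from the fitted preprocessor itself. A minimal sketch, assuming a recent scikit-learn (≥ 1.1, where `get_feature_names_out` is available on pipelines and column transformers):
# recover the transformed feature names so the labels always match the coefficient order
feature_names = lm2.best_estimator_['preprocessor'].get_feature_names_out()
pd.DataFrame({'feature': feature_names,
              'coefficient': lm2.best_estimator_['Regressor'].coef_})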
# let's evaluate the model performance using MAE, MSE and RMSE
y_pred = lm2.predict(X_test)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
Mean Absolute Error: 1.0916569015462978 Mean Squared Error: 2.0925408260809752 Root Mean Squared Error: 1.4465617256380645
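Note that the "model score" printed by `GridSearchCV.score` is R² for regressors. As a sanity check, it can be recomputed from the same predictions:
# R^2 on the held-out test set; should match the model score printed above
print('R^2:', metrics.r2_score(y_test, y_pred))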
def run_gb_model(X_train, y_train, viscosity='not_included'):
    if viscosity == 'not_included':
        numeric_features = ['Water_content', 'time_minutes']
    else:
        numeric_features = ['Water_content', 'time_minutes', 'viscosity']
    # first, a transformer for the numeric features
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    # now a transformer for the categorical features
    categorical_features = ['Gas']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    # creating a preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])
    gb_model = GradientBoostingRegressor()
    reg_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('Regressor', gb_model)])
    param_grid = {
        'Regressor__learning_rate': [0.1, 0.25, 0.4],
    }
    search = GridSearchCV(reg_pipeline, param_grid, cv=5)
    # fitting the model
    search.fit(X_train, y_train)
    # printing the best parameters and the test-set score (X_test, y_test are globals here)
    print(search.best_params_)
    print("model score: %.3f" % search.score(X_test, y_test))
    return search
gb = run_gb_model(X_train,y_train, viscosity = 'not_included')
{'Regressor__learning_rate': 0.4} model score: 0.995
print("model score: %.3f" % gb.score(X_test, y_test))
model score: 0.995
y_pred = gb.predict(X_test)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
Mean Absolute Error: 0.07874543098019307 Mean Squared Error: 0.021835685391972143 Root Mean Squared Error: 0.14776902717407372
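To see which inputs drive the gradient-boosting pipeline, permutation importance on the test set is one option. A minimal sketch, assuming scikit-learn ≥ 0.22 for `sklearn.inspection.permutation_importance`:
from sklearn.inspection import permutation_importance
# shuffle each raw input column in turn and measure the drop in test-set R^2;
# the full pipeline (preprocessing + regressor) is evaluated, so raw columns can be passed
perm = permutation_importance(gb.best_estimator_, X_test, y_test, n_repeats=10, random_state=123)
for name, mean_imp in zip(X_test.columns, perm.importances_mean):
    print(f'{name}: {mean_imp:.3f}')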
# prediction for a new value
new_data = X_test.iloc[[0]]
new_data
| | Gas | Water_content | viscosity | time_minutes |
---|---|---|---|---|
466 | CO2 | 0.1 | 35930.0 | 195.0 |
gb.predict(new_data)
array([19.1559994])
# how predictions compare with the actual results
X_test.loc[:,'true_ift'] = y_test[:]
X_test.loc[:,'predicted_ift'] = y_pred[:]
X_test
| | Gas | Water_content | viscosity | time_minutes | true_ift | predicted_ift |
---|---|---|---|---|---|---|
466 | CO2 | 0.10 | 35930.00000 | 195.0 | 19.12 | 19.155999 |
157 | CH4 | 0.20 | 45845.00000 | 555.0 | 24.38 | 24.409911 |
452 | CO2 | 0.00 | 35930.00000 | 930.0 | 23.80 | 23.903846 |
449 | CO2 | 0.00 | 35930.00000 | 885.0 | 23.81 | 23.763949 |
467 | CO2 | 0.10 | 35930.00000 | 210.0 | 19.09 | 19.120444 |
... | ... | ... | ... | ... | ... | ... |
508 | CO2 | 0.10 | 35930.00000 | 825.0 | 18.91 | 18.897314 |
374 | CH4 | 0.70 | 236837.08330 | 615.0 | 25.08 | 25.193992 |
181 | CH4 | 0.20 | 45845.00000 | 915.0 | 24.54 | 24.562454 |
485 | CO2 | 0.10 | 35930.00000 | 480.0 | 18.91 | 18.949683 |
200 | CH4 | 0.33 | 61030.55556 | 510.0 | 23.85 | 23.838603 |
113 rows × 6 columns
# let's visualize that
plt.figure(figsize=(10, 6), dpi=80)
plt.scatter(X_test['Water_content'],X_test['true_ift'], marker = 'x', label="True" )
plt.scatter(X_test['Water_content'],X_test['predicted_ift'],marker = 'o', alpha = 0.6, label="predicted")
plt.xlabel("Water Content")
plt.ylabel("IFT")
plt.legend(loc='lower left')
plt.title('Actual vs Predicted IFT')
plt.show()
## Saving the model for future use
from joblib import dump, load
dump(gb.best_estimator_, 'model.pkl')
model1 = load('model.pkl')
model1.predict(new_data)
array([19.1559994])
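The saved pipeline can then be reused outside the notebook, for example behind the prediction app described in the conclusions. A minimal sketch of a helper that takes a raw input row and returns an IFT estimate (the function name and defaults are illustrative, not part of the actual app code):
def predict_ift(gas, water_content, viscosity, time_minutes, model_path='model.pkl'):
    """Return an IFT estimate for a single raw input row using the saved pipeline."""
    model = load(model_path)
    row = pd.DataFrame([{'Gas': gas,
                         'Water_content': water_content,
                         'viscosity': viscosity,
                         'time_minutes': time_minutes}])
    return float(model.predict(row)[0])

# example call, same row as new_data above
predict_ift('CO2', 0.1, 35930.0, 195.0)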
## Building a simple linear model with one feature: Water Content
ift_data = pd.read_excel('data/ift_data.xlsx')
len(ift_data)
561
X = ift_data[['Water_content']]  # single feature: Water_content
y = ift_data.iloc[:, 5]  # target: IFT
X.head()
| | Water_content |
---|---|
0 | 0.0 |
1 | 0.0 |
2 | 0.0 |
3 | 0.0 |
4 | 0.0 |
y
0 25.08 1 25.12 2 25.16 3 25.17 4 25.21 ... 556 19.93 557 19.95 558 19.87 559 19.92 560 19.86 Name: IFT, Length: 561, dtype: float64
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, shuffle=True)
train_features = X_train[['Water_content']]
test_features = X_test[['Water_content']]
normalizer = preprocessing.Normalization()
train_labels = y_train
test_labels = y_test
features = np.array(X_train)
f_normalizer = preprocessing.Normalization(input_shape=[1,])
f_normalizer.adapt(features)
linear_model = tf.keras.Sequential([
f_normalizer,
layers.Dense(units=1)
])
linear_model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= normalization_1 (Normalizati (None, 1) 3 _________________________________________________________________ dense (Dense) (None, 1) 2 ================================================================= Total params: 5 Trainable params: 2 Non-trainable params: 3 _________________________________________________________________
linear_model.compile(
optimizer=tf.optimizers.Adam(learning_rate=0.1),
loss='mean_squared_error')
%%time
history = linear_model.fit(
train_features, train_labels,
epochs=100,
# suppress logging
verbose=0,
# Calculate validation results on 20% of the training data
validation_split = 0.2)
CPU times: user 2.28 s, sys: 197 ms, total: 2.47 s Wall time: 2.21 s
plot_loss(history)
test_results = {}
test_results['linear_model'] = linear_model.evaluate(
test_features,
test_labels, verbose=0)
test_results
{'linear_model': 4.279341220855713}
water_content = tf.linspace(0.0, 1, 251)
ift = linear_model.predict(water_content )
plot_ift(water_content,ift)
### Non-linear Model
def build_and_compile_model(norm):
    model = keras.Sequential([
        norm,
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(loss='mean_squared_error',
                  optimizer=tf.keras.optimizers.Adam(0.001))
    return model
dnn_horsepower_model = build_and_compile_model(f_normalizer)
dnn_horsepower_model.summary()
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= normalization_1 (Normalizati (None, 1) 3 _________________________________________________________________ dense_1 (Dense) (None, 64) 128 _________________________________________________________________ dense_2 (Dense) (None, 64) 4160 _________________________________________________________________ dense_3 (Dense) (None, 1) 65 ================================================================= Total params: 4,356 Trainable params: 4,353 Non-trainable params: 3 _________________________________________________________________
%%time
history = dnn_horsepower_model.fit(
train_features, train_labels,
validation_split=0.2,
verbose=0, epochs=100)
CPU times: user 2.72 s, sys: 412 ms, total: 3.13 s Wall time: 2.31 s
plot_loss(history)
test_results['non_linear'] =dnn_horsepower_model.evaluate(
test_features,
test_labels, verbose=0)
test_results
{'linear_model': 4.279341220855713, 'non_linear': 3.3725452423095703}
water_content = tf.linspace(0.0, 1, 251)
ift = dnn_horsepower_model.predict(water_content )
plot_ift(water_content, ift)
ift_data = pd.read_excel('data/ift_data.xlsx')
X= ift_data[['Water_content','viscosity','time_minutes','Gas']] # get x
y = ift_data['IFT'] # get y
X
| | Water_content | viscosity | time_minutes | Gas |
---|---|---|---|---|
0 | 0.0 | 27000.0000 | 0.000050 | CH4 |
1 | 0.0 | 27000.0000 | 14.998333 | CH4 |
2 | 0.0 | 27000.0000 | 30.000000 | CH4 |
3 | 0.0 | 27000.0000 | 45.000000 | CH4 |
4 | 0.0 | 27000.0000 | 60.000000 | CH4 |
... | ... | ... | ... | ... |
556 | 0.7 | 236837.0833 | 150.000000 | CO2 |
557 | 0.7 | 236837.0833 | 165.000000 | CO2 |
558 | 0.7 | 236837.0833 | 180.000000 | CO2 |
559 | 0.7 | 236837.0833 | 195.000000 | CO2 |
560 | 0.7 | 236837.0833 | 210.000000 | CO2 |
561 rows × 4 columns
X = pd.get_dummies(X, prefix='', prefix_sep='')
X.head()
| | Water_content | viscosity | time_minutes | CH4 | CO2 |
---|---|---|---|---|---|
0 | 0.0 | 27000.0 | 0.000050 | 1 | 0 |
1 | 0.0 | 27000.0 | 14.998333 | 1 | 0 |
2 | 0.0 | 27000.0 | 30.000000 | 1 | 0 |
3 | 0.0 | 27000.0 | 45.000000 | 1 | 0 |
4 | 0.0 | 27000.0 | 60.000000 | 1 | 0 |
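One caveat with `pd.get_dummies`: a new sample containing only one gas would produce only one dummy column, which no longer matches the training layout. A minimal, illustrative sketch of keeping the columns aligned by reindexing to the training columns:
# hypothetical new sample with a single gas; get_dummies alone would yield only a 'CO2' column
new_sample = pd.DataFrame([{'Water_content': 0.4, 'viscosity': 70000.0,
                            'time_minutes': 300.0, 'Gas': 'CO2'}])
new_encoded = pd.get_dummies(new_sample, prefix='', prefix_sep='')
# reindex to the training columns, filling any missing dummy columns with 0
new_encoded = new_encoded.reindex(columns=X.columns, fill_value=0)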
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, shuffle=True)
train_features = X_train
test_features = X_test
normalizer = preprocessing.Normalization()
train_labels = y_train
test_labels = y_test
features = np.array(X_train)
ift_normalizer = preprocessing.Normalization(input_shape=[5,])
ift_normalizer.adapt(features)
ift_model = tf.keras.Sequential([
ift_normalizer,
layers.Dense(units=1)
])
ift_model.summary()
Model: "sequential_2" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= normalization_3 (Normalizati (None, 5) 11 _________________________________________________________________ dense_4 (Dense) (None, 1) 6 ================================================================= Total params: 17 Trainable params: 6 Non-trainable params: 11 _________________________________________________________________
all_features_model = build_and_compile_model(ift_normalizer)
all_features_model.summary()
Model: "sequential_3" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= normalization_3 (Normalizati (None, 5) 11 _________________________________________________________________ dense_5 (Dense) (None, 64) 384 _________________________________________________________________ dense_6 (Dense) (None, 64) 4160 _________________________________________________________________ dense_7 (Dense) (None, 1) 65 ================================================================= Total params: 4,620 Trainable params: 4,609 Non-trainable params: 11 _________________________________________________________________
%%time
history = all_features_model.fit(
train_features, train_labels,
validation_split=0.2,
verbose=0, epochs=100)
CPU times: user 2.7 s, sys: 415 ms, total: 3.11 s Wall time: 2.29 s
plot_loss(history)
test_results['all_features'] =all_features_model.evaluate(
test_features,
test_labels, verbose=0)
test_results
{'linear_model': 4.279341220855713, 'non_linear': 3.3725452423095703, 'all_features': 0.9747514128684998}
### Deep Learning: The effect of adding more layers to the ANN
def build_and_compile_model_4(norm):
    model = keras.Sequential([
        norm,
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(loss='mean_squared_error',
                  optimizer=tf.keras.optimizers.Adam(0.001))
    return model
def build_and_compile_model_6(norm):
    model = keras.Sequential([
        norm,
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(loss='mean_squared_error',
                  optimizer=tf.keras.optimizers.Adam(0.001))
    return model
def build_and_compile_model_8(norm):
    model = keras.Sequential([
        norm,
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(loss='mean_squared_error',
                  optimizer=tf.keras.optimizers.Adam(0.001))
    return model
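The three builders above differ only in the number of hidden layers, so they could be collapsed into a single parameterized helper. A minimal sketch (the name `build_and_compile_model_n` is illustrative):
def build_and_compile_model_n(norm, n_hidden, units=64):
    # n_hidden Dense ReLU layers on top of the normalization layer, plus a linear output
    model = keras.Sequential(
        [norm]
        + [layers.Dense(units, activation='relu') for _ in range(n_hidden)]
        + [layers.Dense(1)]
    )
    model.compile(loss='mean_squared_error',
                  optimizer=tf.keras.optimizers.Adam(0.001))
    return model

# e.g. build_and_compile_model_n(ift_normalizer, 3) is equivalent to build_and_compile_model_4 above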
### 4 layers
all_features_model_enhanced = build_and_compile_model_4(ift_normalizer)
all_features_model_enhanced.summary()
Model: "sequential_4" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= normalization_3 (Normalizati (None, 5) 11 _________________________________________________________________ dense_8 (Dense) (None, 64) 384 _________________________________________________________________ dense_9 (Dense) (None, 64) 4160 _________________________________________________________________ dense_10 (Dense) (None, 64) 4160 _________________________________________________________________ dense_11 (Dense) (None, 1) 65 ================================================================= Total params: 8,780 Trainable params: 8,769 Non-trainable params: 11 _________________________________________________________________
%%time
history = all_features_model_enhanced.fit(
train_features, train_labels,
validation_split=0.2,
verbose=0, epochs=10000)
CPU times: user 4min 23s, sys: 1min, total: 5min 23s Wall time: 3min 14s
test_results['all_features_enhanced'] =all_features_model_enhanced.evaluate(
test_features,
test_labels, verbose=0)
test_results
{'linear_model': 4.279341220855713, 'non_linear': 3.3725452423095703, 'all_features': 0.9747514128684998, 'all_features_enhanced': 0.05865849554538727}
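Training for a fixed 10,000 epochs is expensive; the same fit could instead stop once the validation loss stops improving, using Keras' `EarlyStopping` callback. A minimal sketch (the `patience` value is an illustrative choice):
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=100,
                                           restore_best_weights=True)
# alternative form of the fit call above: training halts when val_loss plateaus
history = all_features_model_enhanced.fit(
    train_features, train_labels,
    validation_split=0.2,
    verbose=0, epochs=10000,
    callbacks=[early_stop])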
### 6 Layers:
all_features_model_enhanced = build_and_compile_model_6(ift_normalizer)
all_features_model_enhanced.summary()
Model: "sequential_5" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= normalization_3 (Normalizati (None, 5) 11 _________________________________________________________________ dense_12 (Dense) (None, 64) 384 _________________________________________________________________ dense_13 (Dense) (None, 64) 4160 _________________________________________________________________ dense_14 (Dense) (None, 64) 4160 _________________________________________________________________ dense_15 (Dense) (None, 64) 4160 _________________________________________________________________ dense_16 (Dense) (None, 64) 4160 _________________________________________________________________ dense_17 (Dense) (None, 1) 65 ================================================================= Total params: 17,100 Trainable params: 17,089 Non-trainable params: 11 _________________________________________________________________
%%time
history = all_features_model_enhanced.fit(
train_features, train_labels,
validation_split=0.2,
verbose=0, epochs=10000)
CPU times: user 5min 8s, sys: 1min 29s, total: 6min 37s Wall time: 3min 32s
test_results['all_features_enhanced'] =all_features_model_enhanced.evaluate(
test_features,
test_labels, verbose=0)
test_results
{'linear_model': 4.279341220855713, 'non_linear': 3.3725452423095703, 'all_features': 0.9747514128684998, 'all_features_enhanced': 0.06746986508369446}
### 8 Layers:
all_features_model_enhanced = build_and_compile_model_8(ift_normalizer)
all_features_model_enhanced.summary()
Model: "sequential_6" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= normalization_3 (Normalizati (None, 5) 11 _________________________________________________________________ dense_18 (Dense) (None, 64) 384 _________________________________________________________________ dense_19 (Dense) (None, 64) 4160 _________________________________________________________________ dense_20 (Dense) (None, 64) 4160 _________________________________________________________________ dense_21 (Dense) (None, 64) 4160 _________________________________________________________________ dense_22 (Dense) (None, 64) 4160 _________________________________________________________________ dense_23 (Dense) (None, 64) 4160 _________________________________________________________________ dense_24 (Dense) (None, 64) 4160 _________________________________________________________________ dense_25 (Dense) (None, 1) 65 ================================================================= Total params: 25,420 Trainable params: 25,409 Non-trainable params: 11 _________________________________________________________________
%%time
history = all_features_model_enhanced.fit(
train_features, train_labels,
validation_split=0.2,
verbose=0, epochs=10000)
CPU times: user 5min 45s, sys: 1min 49s, total: 7min 35s Wall time: 3min 46s
test_results['all_features_enhanced'] =all_features_model_enhanced.evaluate(
test_features,
test_labels, verbose=0)
test_results
{'linear_model': 4.279341220855713, 'non_linear': 3.3725452423095703, 'all_features': 0.9747514128684998, 'all_features_enhanced': 0.06490664184093475}
# Best model: all features with 4 layers (MSE ≈ 0.059)
# Note: each deeper run overwrote the 'all_features_enhanced' key, so the table below shows the last (8-layer) result
pd.DataFrame(test_results.items(), columns = ['Model','MSE'])
| | Model | MSE |
---|---|---|
0 | linear_model | 4.279341 |
1 | non_linear | 3.372545 |
2 | all_features | 0.974751 |
3 | all_features_enhanced | 0.064907 |
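Since all the Keras models were trained with a mean-squared-error loss, the values above are MSE; taking the square root expresses the error in IFT units, which is easier to interpret. A minimal sketch:
results = pd.DataFrame(test_results.items(), columns=['Model', 'MSE'])
results['RMSE'] = np.sqrt(results['MSE'])  # error expressed in IFT units
results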
Using the gradient-boosting model, I built a prediction app that can be accessed here: https://iftforemulsions.herokuapp.com/
The user enters the type of gas, the water content and the time elapsed since the start of the experiment to get the IFT and volume ratio.
Predictions can also be produced for out-of-sample data. For example, we can still get an estimate of the IFT for an emulsion at a water content of 0.9, even though no experiment was performed at that water content.
Similarly, we could build an app to predict the viscosity of the emulsion at any water content. Of course, this model is only valid for oils with properties similar to the one we tested, a high-viscosity oil [27,000 cP], but we could generalize it to study the effect of oil composition on the results.
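As an illustration of such an out-of-sample query, the saved gradient-boosting pipeline can be asked for a prediction at a water content of 0.9 (the other values are illustrative; no experiment in the dataset used this water content, so the result is an extrapolation to treat with caution):
# hypothetical out-of-sample query: CO2 emulsion at 0.9 water content after 300 minutes
out_of_sample = pd.DataFrame([{'Gas': 'CO2', 'Water_content': 0.9,
                               'viscosity': 236837.0833, 'time_minutes': 300.0}])
model1.predict(out_of_sample)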