Commit bff26901 authored by Elias

Split the data and add the prediction

parent 307ea861
%% Cell type:markdown id: tags:
<a href="https://colab.research.google.com/github/Eliascc5/English_proficiency_prediction_NLP/blob/main/projet_IAS_partie2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
%% Cell type:code id: tags:
```
# As in the first stage, we mount Drive to load our dataset
# Dataset: NICT_JLE_4.1
# Reference: https://alaginrc.nict.go.jp/nict_jle/index_E.html
from google.colab import drive
drive.mount("/content/gdrive")
#----------------------------------
```
%% Output
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
%% Cell type:code id: tags:
```
import os

path_test = r'/content/gdrive/MyDrive/NICT_JLE_4.1/Output/'
os.chdir(path_test)

vocabulary = []  # one transcription per candidate
y_output = []    # one proficiency score per candidate

for file in os.listdir():
    if file.endswith(".txt"):
        file_path = f"{path_test}/{file}"
        with open(file_path, mode='r', encoding="utf8", errors='ignore') as f:
            score = f.readline()  # the first line holds the score
            content = f.read()    # the rest is the transcription
            vocabulary.append(content)
            y_output.append(int(score))
######################################
```
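%% Cell type:markdown id: tags:
The loader above assumes each `.txt` file carries the candidate's score on its first line and the transcription on the remaining lines. A minimal sketch with a hypothetical file body (the real files are produced in the first stage of the project):
%% Cell type:code id: tags:
```
# Hypothetical example of the layout the loader expects:
#   first line     -> the proficiency score (an integer)
#   remaining text -> the candidate's transcription
example = "4\ner well i usually get up at seven and go to school"
first_line, rest = example.split("\n", 1)
print(int(first_line))  # 4 -> appended to y_output
print(rest)             # transcription -> appended to vocabulary
```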
%% Cell type:markdown id: tags:
**NOTE:** We build the vocabulary from the transcriptions of all the candidates so that it is as general as possible.
Each element of the `vocabulary` list holds one candidate's transcription; in other words, `len(vocabulary) == 1281`.
Reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
%% Cell type:code id: tags:
```
from sklearn.feature_extraction.text import CountVectorizer

# TODO: the transcriptions still need cleaning
# Create the transform: convert a collection of text documents
# to a matrix of token counts
vectorizer = CountVectorizer()
# Tokenize and build the vocabulary (learn a vocabulary dictionary
# of all tokens in the raw documents)
vectorizer.fit(vocabulary)
# Summarize the learned vocabulary
print(vectorizer.vocabulary_)
#print(vectorizer.get_feature_names())
# Encode the documents
vector = vectorizer.transform(vocabulary)
# Summarize the encoded vectors
#print(vector.shape)
#print(type(vector))
#print(vector.toarray())
```
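%% Cell type:markdown id: tags:
To make the encoding concrete, here is a toy sketch (with made-up documents, not our data) of what `CountVectorizer` learns and produces:
%% Cell type:code id: tags:
```
# Toy illustration of CountVectorizer on two made-up documents
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat", "the cat sat on the mat"]
cv = CountVectorizer()
counts = cv.fit_transform(docs)
print(cv.vocabulary_)    # term -> column index, e.g. {'the': 4, 'cat': 0, 'sat': 3, 'on': 2, 'mat': 1}
print(counts.toarray())  # [[1 0 0 1 1]
                         #  [1 1 1 1 2]]
```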
%% Cell type:markdown id: tags:
We extract the results
%% Cell type:markdown id: tags:
**Here we start with the Neural Network**
%% Cell type:code id: tags:
```
from keras import models
from keras import layers
from keras.utils import np_utils
import tensorflow as tf
import numpy as np

#######################################
n = len(vocabulary)              # number of candidates (1281)
m = len(vectorizer.vocabulary_)  # number of distinct words in the vocabulary (15157)

array = vector.toarray()           # INPUT: one count vector per candidate
array_output = np.array(y_output)  # OUTPUT: the score of each candidate
array_output = np_utils.to_categorical(array_output)
array_output = array_output[:, 1:10]  # drop the unused class-0 column, keep scores 1..9

########################################
model = models.Sequential()
model.add(layers.Dense(7000, activation='sigmoid', input_shape=(m,)))
model.add(layers.Dense(200, activation='sigmoid'))
model.add(layers.Dense(400, activation='sigmoid'))
model.add(layers.Dense(200, activation='sigmoid'))
model.add(layers.Dense(50, activation='sigmoid'))
model.add(layers.Dense(9, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Data split
randomGenerator = np.random.RandomState(0)  # random number generator
nbIndividus = len(vocabulary)               # number of individuals in the dataset
randomIndexes = np.arange(nbIndividus)      # build a list of indexes
randomGenerator.shuffle(randomIndexes)      # shuffle that list
# Use the shuffled indexes to split the data into the different sets
# Training: 60%
x_train_array = array[randomIndexes[0:int(nbIndividus * 0.6)]]
y_train_array = array_output[randomIndexes[0:int(nbIndividus * 0.6)]]
# Validation: 20%
x_val_array = array[randomIndexes[int(nbIndividus * 0.6):int(nbIndividus * 0.8)]]
y_val = array_output[randomIndexes[int(nbIndividus * 0.6):int(nbIndividus * 0.8)]]
# Test: 20%
x_test_array = array[randomIndexes[int(nbIndividus * 0.8):]]
y_test = array_output[randomIndexes[int(nbIndividus * 0.8):]]

history = model.fit(x_train_array,
                    y_train_array,
                    epochs=50,
                    batch_size=512,
                    validation_data=(x_val_array, y_val))

# Plot the results
import matplotlib.pyplot as plt

history_dict = history.history
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(acc) + 1)

plt.plot(epochs, loss, 'bo', label='Training loss')       # "bo" = blue dots
plt.plot(epochs, val_loss, 'b', label='Validation loss')  # "b"  = solid blue line
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
```
%% Output
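%% Cell type:markdown id: tags:
Why the `[:, 1:10]` slice above: the scores run from 1 to 9, but `to_categorical` also allocates a column for class 0, which never occurs. A minimal sketch with hypothetical scores:
%% Cell type:code id: tags:
```
# Hypothetical scores illustrating the one-hot slice used above
import numpy as np
from keras.utils import np_utils

scores = np.array([1, 3, 9])
onehot = np_utils.to_categorical(scores)  # shape (3, 10): columns for classes 0..9
print(onehot.shape)           # (3, 10)
print(onehot[:, 1:10].shape)  # (3, 9) -> only the nine classes that actually occur
```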
%% Cell type:markdown id: tags:
**TODO:** add the prediction stage and network performance tests **IN PROGRESS**
%% Cell type:code id: tags:
```
## IN PROGRESS...
predictions = model.predict(x_test_array)
for i in range(10):
    print("Prediction:", np.argmax(predictions[i]) + 1)  # +1 because scores start at 1
    print("Label of x_test:", y_test[i])  # y_test rows are one-hot vectors
    print("------------------")
```
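%% Cell type:markdown id: tags:
As a possible starting point for the performance tests flagged above, a minimal sketch (assuming the `model`, `x_test_array`, and `y_test` defined earlier) that reports overall loss and accuracy on the held-out test set:
%% Cell type:code id: tags:
```
# Sketch of an overall performance check on the test set
test_loss, test_acc = model.evaluate(x_test_array, y_test, verbose=0)
print("Test loss:", test_loss)
print("Test accuracy:", test_acc)
```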