Commit 307ea861 authored by Elias

Add the bag-of-words implementation and a first neural-network attempt

parent 16747c95
%% Cell type:markdown id: tags:
<a href="https://colab.research.google.com/github/Eliascc5/English_proficiency_prediction_NLP/blob/main/projet_IAS_partie2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
%% Cell type:code id: tags:
```
# As in the first stage, we mount Google Drive to load our dataset
# Dataset: NICT JLE Corpus 4.1
# Reference: https://alaginrc.nict.go.jp/nict_jle/index_E.html
from google.colab import drive
drive.mount("/content/gdrive")
#----------------------------------
```
%% Output
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
%% Cell type:code id: tags:
```
import os

path_test = r'/content/gdrive/MyDrive/NICT_JLE_4.1/Output/'
os.chdir(path_test)

list_vocabulary = []  # one transcript per candidate
y_output = []         # proficiency score (first line of each file)

for file in os.listdir():
    if file.endswith(".txt"):
        file_path = os.path.join(path_test, file)
        with open(file_path, mode='r', encoding="utf8", errors='ignore') as f:
            score = f.readline()   # first line holds the score
            content = f.read()     # the rest of the file is the transcript
            list_vocabulary.append(content)
            y_output.append(int(score))
######################################
```
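%% Cell type:markdown id: tags:
A quick sanity check on the loaded data (a minimal sketch, reusing `list_vocabulary` and `y_output` from the cell above): it prints how many transcripts were read and how the proficiency scores are distributed.
%% Cell type:code id: tags:
```
from collections import Counter

# Number of transcripts and the score (label) distribution
print(f"Transcripts loaded: {len(list_vocabulary)}")
print(f"Score distribution: {sorted(Counter(y_output).items())}")
```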
%% Cell type:code id: tags:
```
from sklearn.feature_extraction.text import CountVectorizer

text = list_vocabulary
# create the transform
vectorizer = CountVectorizer()
# tokenize and build the vocabulary
vectorizer.fit(text)
# summarize: the learned token -> index mapping
print(vectorizer.vocabulary_)
# encode the documents as a sparse count matrix
vector = vectorizer.transform(text)
# summarize the encoded vectors
#print(vector.shape)
#print(type(vector))
#print(vector.toarray())
```
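%% Cell type:markdown id: tags:
Raw counts grow with transcript length, so longer interviews dominate the feature vectors. TF-IDF weighting is a common alternative; below is a minimal sketch (not part of the original pipeline) that builds the weighted matrix from the same `text` list.
%% Cell type:code id: tags:
```
from sklearn.feature_extraction.text import TfidfVectorizer

# Same tokenization as CountVectorizer, but counts are re-weighted by
# inverse document frequency and L2-normalized per document.
tfidf = TfidfVectorizer()
vector_tfidf = tfidf.fit_transform(text)
print(vector_tfidf.shape)  # (n_documents, n_unique_tokens)
```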
%% Cell type:markdown id: tags:
We extract the results.
%% Cell type:markdown id: tags:
HERE WE START WITH THE NETWORK!
%% Cell type:code id: tags:
```
from keras import models
from keras import layers
from keras.utils import np_utils
import tensorflow as tf
import numpy as np
#######################################
n = 1281   # candidates
m = 15157  # unique words in the vocabulary
array_output = np.array(y_output)
array_output = np_utils.to_categorical(array_output)
array_output = array_output[:, 1:10]  # keep the 9 score classes (1..9)
########################################
model = models.Sequential()
model.add(layers.Dense(7000, activation='sigmoid', input_shape=(m,)))
model.add(layers.Dense(200, activation='sigmoid'))
model.add(layers.Dense(50, activation='sigmoid'))
model.add(layers.Dense(9, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

array = vector.toarray()
x_val_array = array[:round(0.2*n)]  # 20 percent of the candidates
y_val = array_output[:round(0.2*n)]
partial_x_train_array = array[round(0.2*n):]
partial_y_train_array = array_output[round(0.2*n):]

history = model.fit(partial_x_train_array,
                    partial_y_train_array,
                    epochs=40,
                    batch_size=300,
                    validation_data=(x_val_array, y_val))

# plot the results:
import matplotlib.pyplot as plt

history_dict = history.history
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# "b" is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
```
%% Output
Epoch 1/40
1/4 [======>.......................] - ETA: 13s - loss: 3.0324 - accuracy: 0.0233
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-146-6bc6036f244e> in <module>()
42 epochs=40,
43 batch_size=300,
---> 44 validation_data=(x_val_array, y_val))
45
46 #plot the results :
/usr/local/lib/python3.7/dist-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1156 _r=1):
1157 callbacks.on_train_batch_begin(step)
-> 1158 tmp_logs = self.train_function(iterator)
1159 if data_handler.should_sync:
1160 context.async_wait()
/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
887
888 with OptionalXlaContext(self._jit_compile):
--> 889 result = self._call(*args, **kwds)
890
891 new_tracing_count = self.experimental_get_tracing_count()
/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
915 # In this case we have created variables on the first call, so we run the
916 # defunned version which is guaranteed to never create variables.
--> 917 return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
918 elif self._stateful_fn is not None:
919 # Release the lock early so that multiple threads can perform the call
/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/function.py in __call__(self, *args, **kwargs)
3022 filtered_flat_args) = self._maybe_define_function(args, kwargs)
3023 return graph_function._call_flat(
-> 3024 filtered_flat_args, captured_inputs=graph_function.captured_inputs) # pylint: disable=protected-access
3025
3026 @property
/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1959 # No tape is watching; skip to running the function.
1960 return self._build_call_outputs(self._inference_function.call(
-> 1961 ctx, args, cancellation_manager=cancellation_manager))
1962 forward_backward = self._select_forward_and_backward_functions(
1963 args,
/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/function.py in call(self, ctx, args, cancellation_manager)
594 inputs=args,
595 attrs=attrs,
--> 596 ctx=ctx)
597 else:
598 outputs = execute.execute_with_cancellation(
/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
58 ctx.ensure_initialized()
59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
KeyboardInterrupt:
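%% Cell type:markdown id: tags:
One caveat in the training cell above: the validation set is simply the first 20% of files in directory-listing order, which may correlate with the labels. A shuffled, stratified split is usually safer; below is a minimal sketch using scikit-learn (an assumption, not the original pipeline), reusing `array` and `array_output` from above.
%% Cell type:code id: tags:
```
from sklearn.model_selection import train_test_split

# Stratify on the integer label so each score level keeps the same
# proportion in the train and validation sets.
labels = array_output.argmax(axis=1)
x_train, x_val, y_train, y_val = train_test_split(
    array, array_output, test_size=0.2, random_state=42, stratify=labels)
print(x_train.shape, x_val.shape)
```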