diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/NER_BERT/LICENSE b/NER_BERT/LICENSE new file mode 100644 index 0000000..642dede --- /dev/null +++ b/NER_BERT/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Atul Kumar + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/NER_BERT/README.md b/NER_BERT/README.md new file mode 100644 index 0000000..0f0b7d0 --- /dev/null +++ b/NER_BERT/README.md @@ -0,0 +1 @@ +# WIP: BERT vs BiLSTM NER diff --git a/NER_BERT/bert_data_util.py b/NER_BERT/bert_data_util.py new file mode 100644 index 0000000..3acc676 --- /dev/null +++ b/NER_BERT/bert_data_util.py @@ -0,0 +1,92 @@ +from __future__ import absolute_import, division, print_function + +import os + +from transformers import BertTokenizer + +import torch +import torch.nn as nn + +import const + +bert_tokenizer = BertTokenizer.from_pretrained(const.MODEL_TYPE, do_lower_case=True) + +def get_bert_data(examples, tag2id, config): + bert_data = [] + for orig_tokens, orig_tag in examples: + input_ids, label_ids, segment_ids, tokens = prepare_bert_input(orig_tokens, orig_tag, tag2id, config) + bert_data.append((input_ids, label_ids, segment_ids)) + return bert_data + +def prepare_bert_input(orig_tokens, orig_tag, tag2id, config): + tokens = [] + label_ids = [] + + assert len(orig_tag) == len(orig_tokens), orig_tag + orig_tokens + + for i, t in enumerate(orig_tokens): + label_t = tag2id[orig_tag[i]] + bert_tokens = bert_tokenizer.tokenize(t) + bert_tokens_len = len(bert_tokens) + if bert_tokens_len > 0: + tokens.extend(bert_tokens) + label_ids.append(label_t) + + # pad label if multiple tokens for a single word + if bert_tokens_len > 1: + label_ids.extend([const.label_pad_id] * (bert_tokens_len - 1)) + + assert len(tokens) == len(label_ids) + ###truncate large sequence### + tokens = tokens[:config.max_seq_length] + label_ids = label_ids[:config.max_seq_length] + ############################## + segment_ids = [const.sequence_a_segment_id] * len(tokens) + + tokens = [const.cls_token] + tokens + [const.sep_token] + + label_ids = [const.label_pad_id] + label_ids + [const.label_pad_id] + segment_ids = [const.cls_token_segment_id] + segment_ids + 
[const.sequence_a_segment_id] + input_ids = bert_tokenizer.convert_tokens_to_ids(tokens) + + assert len(input_ids) == len(label_ids) + assert len(input_ids) == len(segment_ids) + + input_ids = torch.tensor(input_ids).long() + label_ids = torch.tensor(label_ids).long() + segment_ids = torch.tensor(segment_ids).long() + return input_ids, label_ids, segment_ids, tokens + +def create_batch(train_data, batch_ids, is_cuda): + max_len = max([len(train_data[bi][0]) for bi in batch_ids]) + batch_input_ids = [] + batch_label_ids = [] + batch_segment_ids = [] + for bi in batch_ids: + input_ids, label_ids, segment_ids = train_data[bi] + pad_len = max_len - len(input_ids) + padding_op = nn.ConstantPad1d((0, pad_len), const.pad_token_id) + batch_input_ids.append(padding_op(input_ids).unsqueeze(0)) + padding_op = nn.ConstantPad1d((0, pad_len), const.label_pad_id) + batch_label_ids.append(padding_op(label_ids).unsqueeze(0)) + padding_op = nn.ConstantPad1d((0, pad_len), const.pad_token_segment_id) + batch_segment_ids.append(padding_op(segment_ids).unsqueeze(0)) + + batch_input_ids = torch.cat(batch_input_ids) + batch_label_ids = torch.cat(batch_label_ids) + batch_segment_ids = torch.cat(batch_segment_ids) + + att_mask = batch_input_ids.ne(const.pad_token_id) + + if is_cuda: + batch_input_ids = batch_input_ids.cuda() + batch_label_ids = batch_label_ids.cuda() + batch_segment_ids = batch_segment_ids.cuda() + att_mask = att_mask.cuda() + + inputs = {'input_ids': batch_input_ids, + 'attention_mask': att_mask, + 'token_type_ids': batch_segment_ids, + 'labels': batch_label_ids} + return inputs + diff --git a/NER_BERT/const.py b/NER_BERT/const.py new file mode 100644 index 0000000..7624956 --- /dev/null +++ b/NER_BERT/const.py @@ -0,0 +1,44 @@ +from torch.nn import CrossEntropyLoss + +pad_token_id = 0 + +ENTITY_OTHER = 'O' +ENTITY_BEGIN = 'B-' +ENTITY_CONT = 'I-' +ENTITY_SINGLE = 'S-' +ENTITY_END = 'E-' + +UNK_INTENT = 'unknown' + +label_pad_id = CrossEntropyLoss().ignore_index +sep_token = "[SEP]" +cls_token = "[CLS]" +cls_token_segment_id = 0 +sequence_a_segment_id = 0 +pad_token_segment_id = 0 + +MODEL_TYPE = 'bert-base-uncased' + +ENTITY_NAMES=[ +"Appeal_to_Authority", +"Appeal_to_fear-prejudice", +"Black-and-White_Fallacy", +"Causal_Oversimplification", +"Doubt", +"Exaggeration,Minimisation", +"Flag-Waving", +"Loaded_Language", +"Name_Calling,Labeling", +"Obfuscation,Intentional_Vagueness,Confusion", +"Repetition", +"Slogans", +"Thought-terminating_Cliches", +"Whataboutism,Straw_Men,Red_Herring", +"Bandwagon,Reductio_ad_hitlerum" +] + +_UNK = "" +_PAD = "" +_START_VOCAB = [_UNK, _PAD] +UNK_ID = 0 +PAD_ID = 1 \ No newline at end of file diff --git a/NER_BERT/crf.py b/NER_BERT/crf.py new file mode 100644 index 0000000..d5ebe09 --- /dev/null +++ b/NER_BERT/crf.py @@ -0,0 +1,182 @@ +from __future__ import unicode_literals, print_function, division + +import torch +import torch.nn as nn +from torch.nn.utils.rnn import pad_sequence +import numpy as np + +is_cuda = torch.cuda.is_available() + +class CRF_Loss(nn.Module): + def __init__(self, tagset_size, pad_token_id, tag_pad_id): + super(CRF_Loss, self).__init__() + self.start_tag = tagset_size + self.end_tag = tagset_size + 1 + self.num_tags = tagset_size + 2 + self.tag_pad_id = tag_pad_id + self.pad_token_id = pad_token_id + + self.transitions = nn.Parameter(torch.Tensor(self.num_tags, self.num_tags)) + nn.init.constant_(self.transitions, -np.log(self.num_tags)) + + self.transitions.data[self.end_tag, :] = -10000 + self.transitions.data[:, self.start_tag] = 
-10000 + + def get_log_p_z(self, emissions, mask): + seq_len = emissions.shape[1] + log_alpha = emissions[:, 0].clone() + log_alpha += self.transitions[self.start_tag, : self.start_tag].unsqueeze(0) + + for idx in range(1, seq_len): + broadcast_emissions = emissions[:, idx].unsqueeze(1) + broadcast_transitions = self.transitions[ : self.start_tag, : self.start_tag].unsqueeze(0) + broadcast_logprob = log_alpha.unsqueeze(2) + score = broadcast_logprob + broadcast_emissions + broadcast_transitions + + score = torch.logsumexp(score, 1) + log_alpha = score * mask[:, idx].unsqueeze(1) + log_alpha.squeeze(1) * (1.0 - mask[:, idx].unsqueeze(1)) + + log_alpha += self.transitions[: self.start_tag, self.end_tag].unsqueeze(0) + return torch.logsumexp(log_alpha.squeeze(1), 1) + + def get_log_p_Y_X(self, emissions, mask, orig_tags): + seq_len = emissions.shape[1] + tags = orig_tags.clone() + tags[tags < 0] = 0 + + llh = self.transitions[self.start_tag, tags[:, 0]].unsqueeze(1) + llh += emissions[:, 0, :].gather(1, tags[:, 0].view(-1, 1)) * mask[:, 0].unsqueeze(1) + + for idx in range(1, seq_len): + old_state, new_state = ( + tags[:, idx - 1].view(-1, 1), + tags[:, idx].view(-1, 1), + ) + emission_scores = emissions[:, idx, :].gather(1, new_state) + transition_scores = self.transitions[old_state, new_state] + llh += (emission_scores + transition_scores) * mask[:, idx].unsqueeze(1) + + last_tag_indices = mask.sum(1, dtype=torch.long) - 1 + last_tags = tags.gather(1, last_tag_indices.view(-1, 1)) + + llh += self.transitions[last_tags.squeeze(1), self.end_tag].unsqueeze(1) + + return llh.squeeze(1) + + def log_likelihood(self, emissions, tags, mask): + log_z = self.get_log_p_z(emissions, mask) + log_p_y_x = self.get_log_p_Y_X(emissions, mask, tags) + return log_p_y_x - log_z + + def get_crf_loss(self, logits, y): + mask = y.ne(self.tag_pad_id) + s_lens = mask.sum(1) + loss = -1 * self.log_likelihood(logits, y, mask.float()) + loss = loss / s_lens.float() + loss = loss.mean() + return loss + + def viterbi_decode(self, emissions, mask): + mask = mask.float() + b, seq_len, d = emissions.shape + log_prob = emissions[:, 0].clone() + log_prob += self.transitions[self.start_tag, : self.start_tag].unsqueeze(0) + + end_scores = log_prob + self.transitions[: self.start_tag, self.end_tag].unsqueeze(0) + + best_scores_list = [] + best_scores_list.append(end_scores.unsqueeze(1)) + + best_paths_0 = torch.Tensor().long() + if is_cuda: + best_paths_0 = best_paths_0.cuda() + best_paths_list = [best_paths_0] + + for idx in range(1, seq_len): + broadcast_emissions = emissions[:, idx].unsqueeze(1) + broadcast_transmissions = self.transitions[: self.start_tag, : self.start_tag].unsqueeze(0) + broadcast_log_prob = log_prob.unsqueeze(2) + score = broadcast_emissions + broadcast_transmissions + broadcast_log_prob + max_scores, max_score_indices = torch.max(score, 1) + best_paths_list.append(max_score_indices.unsqueeze(1)) + end_scores = max_scores + self.transitions[: self.start_tag, self.end_tag].unsqueeze(0) + + best_scores_list.append(end_scores.unsqueeze(1)) + log_prob = max_scores + + best_scores = torch.cat(best_scores_list, 1).float() + best_paths = torch.cat(best_paths_list, 1) + + max_scores, max_indices_from_scores = torch.max(best_scores, 2) + + valid_index_tensor = torch.tensor(0).long() + padding_tensor = torch.tensor(self.tag_pad_id).long() + + if is_cuda: + valid_index_tensor = valid_index_tensor.cuda() + padding_tensor = padding_tensor.cuda() + #alternative to where + #curr_mask = mask[:, seq_len - 1].float() + 
#labels = max_indices_from_scores[:, seq_len - 1] * curr_mask + torch.logical_not(curr_mask) * padding_tensor + + labels = max_indices_from_scores[:, seq_len - 1] + labels = torch.where(mask[:, seq_len - 1] != 1.0, padding_tensor, labels) + all_labels = labels.unsqueeze(1).long() + ##### + labels_score = max_scores[:, seq_len - 1] + all_labels_score = labels_score.unsqueeze(1) + #### + for idx in range(seq_len - 2, -1, -1): + indices_for_lookup = all_labels[:, -1].clone() + indices_for_lookup = torch.where(indices_for_lookup == self.tag_pad_id, + valid_index_tensor, + indices_for_lookup) + + indices_from_prev_pos = best_paths[:, idx, :].gather(1, indices_for_lookup.view(-1, 1).long()).squeeze(1) + indices_from_prev_pos = torch.where(mask[:, idx + 1] != 1.0, padding_tensor, indices_from_prev_pos) + + indices_from_max_scores = max_indices_from_scores[:, idx] + indices_from_max_scores = torch.where(mask[:, idx + 1] == 1.0, padding_tensor, indices_from_max_scores) + + labels = torch.where(indices_from_max_scores == self.tag_pad_id, + indices_from_prev_pos, + indices_from_max_scores) + # Set to ignore_index if present state is not valid. + labels = torch.where(mask[:, idx] != 1.0, padding_tensor, labels) + all_labels = torch.cat((all_labels, labels.view(-1, 1).long()), 1) + ###### + labels_score = max_scores[:, idx] + all_labels_score = torch.cat((all_labels_score, labels_score.view(-1, 1)), 1) + #### + #think about squeezing this score between 0 and 1 + last_tag_indices = mask.sum(1, dtype=torch.long) - 1 + sentence_score = max_scores.gather(1, last_tag_indices.view(-1, 1)).squeeze(1) + all_labels = torch.flip(all_labels, [1]) + all_labels_score = torch.flip(all_labels_score, [1]) + + return sentence_score, all_labels, all_labels_score + + def structural_perceptron_loss(self, emissions, tags): + mask = tags.ne(self.tag_pad_id).float() + + best_scores, pred = self.viterbi_decode(emissions, mask, is_cuda) + log_p_y_x = self.get_log_p_Y_X(emissions, mask, tags) + + delta = torch.sum(tags.ne(pred).float()*mask, 1) + + margin_loss = torch.clamp(best_scores + delta - log_p_y_x, min=0.0) + return margin_loss + + def bert_output2crf_input(self, logits_ner, labels): + mask = labels.ne(self.tag_pad_id) + lens = mask.sum(1).view(-1).tolist() + + logits_selected = torch.masked_select(logits_ner, mask.unsqueeze(2)).view(-1, logits_ner.size()[-1]) + logits_split = torch.split(logits_selected, lens) + logits_padded = pad_sequence(logits_split, batch_first=True, padding_value=self.pad_token_id) + + labels_selected = torch.masked_select(labels, mask) + labels_split = torch.split(labels_selected, lens) + labels_padded = pad_sequence(labels_split, batch_first=True, padding_value=self.tag_pad_id) + + return logits_padded, labels_padded diff --git a/NER_BERT/dataset_utils.py b/NER_BERT/dataset_utils.py new file mode 100644 index 0000000..fc45444 --- /dev/null +++ b/NER_BERT/dataset_utils.py @@ -0,0 +1,320 @@ +from __future__ import absolute_import, division, print_function +import glob +import os.path +import codecs +from pathlib import Path +import os +from collections import defaultdict +from sklearn.model_selection import train_test_split +import json +import re + +import const + +def tokenize(w): + q = w + contr_dict = {"’": "'", + "i\'m": "i am", + "won\'t": " will not", + "\'s": " s", + "\'ll": " will", + "\'ve": " have", + "n\'t": " not", + "\'re": " are", + "\'d": " would", + "y'all": " all of you"} + + for contr in contr_dict: + q = q.replace(contr, contr_dict[contr]) + + q_arr = 
re.findall(r"[\w]+[']*[\w]+|[\w]+|[.,!?;:]", q, re.UNICODE) + + q = ' '.join(q_arr) + q = re.sub('[0-9]{5,}', '#####', q) + q = re.sub('[0-9]{4}', '####', q) + q = re.sub('[0-9]{3}', '###', q) + q = re.sub('[0-9]{2}', '##', q) + + q = q.strip().lower().split() + if len(q) == 0: + return [w] + return q + +def tagid2tag_seq(tag_vocab, tagid_seq): + return [[tag_vocab[t] for t in tag_seq] for tag_seq in tagid_seq] + +def get_tokenize_tag(tagging_type, t, n): + if n == 1: + return [t] + if t == const.ENTITY_OTHER or t.startswith(const.ENTITY_CONT) or tagging_type == 'B': + return [t] * n + + name = t[2:] + start = t[:2] + tags = [] + if start == const.ENTITY_BEGIN: + tags.append(t) + for i in range(1, n): + tags.append(const.ENTITY_CONT + name) + return tags + + if tagging_type == "BIOES": + if start == const.ENTITY_SINGLE: + tags.append(t) + for i in range(1, n-1): + tags.append(const.ENTITY_CONT + name) + tags.append(const.ENTITY_END + name) + elif start == const.ENTITY_END: + for i in range(n-1): + tags.append(const.ENTITY_CONT + name) + tags.append(t) + + return tags + +def get_tag_vocab(config): + tags = [const.ENTITY_OTHER] + if config.tagging_type == 'B': + tags.append(const.ENTITY_BEGIN) + else: + tags.extend([const.ENTITY_BEGIN + t for t in const.ENTITY_NAMES]) + tags.extend([const.ENTITY_CONT + t for t in const.ENTITY_NAMES]) + + if config.tagging_type == "BIOES": + tags.extend([const.ENTITY_END + t for t in const.ENTITY_NAMES]) + tags.extend([const.ENTITY_SINGLE + t for t in const.ENTITY_NAMES]) + + return tags + +def read_articles_from_file_list(folder_name, file_pattern="*.txt"): + file_list = glob.glob(os.path.join(folder_name, file_pattern)) + articles = [] + for filename in sorted(file_list): + article_id = os.path.basename(filename).split(".")[0][7:] + with codecs.open(filename, "r", encoding="utf8") as f: + articles.append((article_id, f.read())) + return articles + + +def parse_label(label_path): + labels = [] + f = Path(label_path) + + if not f.exists(): + return labels + + for line in open(label_path): + parts = line.strip().split('\t') + labels.append({'start': int(parts[2]), 'end': int(parts[3]), 'type': parts[1]}) + + labels.sort(key=lambda s: (s['start'], -s['end'])) + return labels + +def clean_text(article): + sentences = article.split('\n') + end = -1 + res = [] + for sentence in sentences: + start = end + 1 + end = start + len(sentence) # length of sequence + if sentence != "": # if not empty line + res.append({'start': start, 'end': end, 'sentence': sentence}) + return res + +def get_overlapping_entities(entities): + etree = defaultdict(set) + # inefficeint but clean + for a in range(len(entities)): + ea = entities[a] + for b in range(a + 1, len(entities)): + eb = entities[b] + overlap_start = max(ea['start'], eb['start']) + overlap_end = min(ea['end'], eb['end']) + if overlap_start <= overlap_end: + # if eb['end'] > ea['end']: + # print('partial', ea, eb) + etree[a].add(b) + etree[b].add(a) + assert all([(a in etree) for k in etree for a in etree[k]]) + # assert all([len(etree[k]) == 1 for k in etree]) + return etree + +def get_per_sentence_entity(entities, ds, de): + d_entities = [] + for a in range(len(entities)): + ea = entities[a] + overlap_start = max(ea['start'], ds) + overlap_end = min(ea['end'], de) + if overlap_start <= overlap_end: + d_entities.append({ + 'type': ea['type'], + 'start': overlap_start - ds, + 'end': overlap_end - ds + }) + d_entities.sort(key=lambda s: (s['start'], -s['end'])) + return d_entities + + +def get_non_overlapping_seq(etree, 
n): + # get non overlapping entity sequence + ex = set() + if len(etree) > 0: + for k in etree: + ex.add(tuple([i for i in range(n) if i not in etree[k]])) + else: + ex.add(tuple(range(n))) + return ex + + +def get_tag_seq(n, name, tagging_type): + tags = [] + for i in range(n): + tag = None + if tagging_type == 'IOBES': + if i == 0: + if n == 1: + tag = const.ENTITY_SINGLE + else: + tag = const.ENTITY_BEGIN + elif i == n - 1: + tag = const.ENTITY_END + else: + tag = const.ENTITY_CONT + elif tagging_type == 'IOB': + if i == 0: + tag = const.ENTITY_BEGIN + else: + tag = const.ENTITY_CONT + elif tagging_type == 'B': + tag = const.ENTITY_BEGIN + else: + raise Exception('tagging_type no recognized') + assert tag is not None + if tagging_type == 'B': + tags.append(tag) + else: + tags.append(tag + name) + + return tags + +def encode_tokens_json(e, sentence, d_entities): + tokens = [] + labels = [] + curr = 0 + for a in sorted(e): + ea = d_entities[a] + pre = sentence[curr:ea['start']].split() + span = sentence[ea['start']:ea['end']].split() + + tokens.extend(pre) + start = len(tokens) + tokens.extend(span) + end = len(tokens) + labels.append({ 'type': ea['type'], + 'start': start, + 'end' : end}) + curr = ea['end'] + + pre = sentence[curr:].split() + tokens.extend(pre) + + return {'tokens': tokens, + 'labels': labels} + +def get_data_train_dev(root_dir, filename, config): + examples = [] + for line in codecs.open(os.path.join(root_dir, filename), "r", encoding="utf8"): + datum = json.loads(line) + orig_tokens = datum['tokens'] + assert len(orig_tokens) > 0, line + orig_tags = [const.ENTITY_OTHER]* len(datum['tokens']) + for e in datum['labels']: + orig_tags[e['start']:e['end']] = get_tag_seq(e['end'] - e['start'], e['type'], config.tagging_type) + + tokens = [] + tags = [] + for i, w in enumerate(orig_tokens): + w_i = tokenize(w) + t_i = orig_tags[i] + tokens.extend(w_i) + tokenized_tags = get_tokenize_tag(config.tagging_type, t_i, len(w_i)) + tags.extend(tokenized_tags) + + assert len(w_i) == len(tokenized_tags), f'{w_i} => {tokenized_tags}' + assert len(tokens) == len(tags) and len(tokens) > 0, f'{tokens} => {tags}' + examples.append((tokens, tags)) + return examples + + +def get_data_test(root_dir, filename): + examples = [] + metadata = [] + + for line in codecs.open(os.path.join(root_dir, filename), "r", encoding="utf8"): + datum = json.loads(line) + orig_tokens = datum['tokens'] + tokens = [] + ignore_mapping = [] + data_tokens = [] + for i, w in enumerate(orig_tokens): + w_i = tokenize(w) + tokens.extend(w_i) + ignore = [0]*len(w_i) + ignore[0] = 1 + ignore_mapping.extend(ignore) + data_tokens.extend([w]*len(w_i)) + + tags = [const.ENTITY_OTHER] * len(tokens) + examples.append((tokens, tags)) + metadata.append({'article_id': datum['article_id'], + 'start_sentence': datum['start_sentence'], + 'end_sentence': datum['end_sentence'], + 'ignore_mapping': ignore_mapping, + 'data_tokens': data_tokens + }) + return examples, metadata + + +def dump_data(root_dir): + train_data = read_articles_from_file_list(os.path.join(root_dir, 'train-articles')) + label_dir = os.path.join(root_dir, 'train-labels-task2-technique-classification') + + examples = [] + for article_id, line in train_data: + entities = parse_label(os.path.join(label_dir, f'article{article_id}.task2-TC.labels')) + + for d in clean_text(line): + d_entities = get_per_sentence_entity(entities, d['start'], d['end']) + etree = get_overlapping_entities(d_entities) + ex = get_non_overlapping_seq(etree, len(d_entities)) + + sentence = 
d['sentence'] + for e in ex: + datum = encode_tokens_json(e, sentence, d_entities) + datum['article_id'] = article_id + examples.append(json.dumps(datum)) + + train_data, dev_data = train_test_split(examples, test_size=0.2) + with codecs.open(os.path.join(root_dir, 'train.jsonl'), "w", encoding="utf8") as f: + f.write('\n'.join(train_data) + '\n') + with codecs.open(os.path.join(root_dir, 'dev.jsonl'), "w", encoding="utf8") as f: + f.write('\n'.join(dev_data) + '\n') + +def dump_data_test(root_dir): + examples = [] + dev_data = read_articles_from_file_list(os.path.join(root_dir, 'dev-articles')) + + for article_id, line in dev_data: + for d in clean_text(line): + datum = {'tokens':d['sentence'].split(), + 'start_sentence': d['start'], + 'end_sentence': d['end'], + 'article_id':article_id} + examples.append(json.dumps(datum)) + + with codecs.open(os.path.join(root_dir, 'test_phase0.jsonl'), "w", encoding="utf8") as f: + f.write('\n'.join(examples) + '\n') + +if __name__ == "__main__": + data_dir = os.path.join(os.path.expanduser("~"), 'prop/datasets') + dump_data(data_dir) + dump_data_test(data_dir) diff --git a/NER_BERT/decoder.py b/NER_BERT/decoder.py new file mode 100644 index 0000000..80c1545 --- /dev/null +++ b/NER_BERT/decoder.py @@ -0,0 +1,111 @@ +import torch +import numpy as np +import torch.nn.functional as F +from torch.nn.utils.rnn import pad_sequence + +#reference https://github.com/allenai/allennlp/blob/a7265c04078964ea2b80a78fc3967bde8d16072d/allennlp/nn/util.py#L403 + +@torch.jit.script +def viterbi_decode_single_jit(tag_sequence, transition_matrix): + top_k = 1 + sequence_length, num_tags = tag_sequence.size() + num_tags = num_tags + 2 + + zero_sentinel = torch.zeros(1, num_tags) + extra_tags_sentinel = torch.ones(sequence_length, 2) * -10000 + tag_sequence = torch.cat([tag_sequence, extra_tags_sentinel], -1) + tag_sequence = torch.cat([zero_sentinel, tag_sequence, zero_sentinel], 0) + + path_indices = torch.zeros(num_tags, dtype=torch.long, device=tag_sequence.device).unsqueeze(0) + path_scores = tag_sequence[0, :].unsqueeze(0) + + for t in range(1, tag_sequence.size(0)): + summed_potentials = path_scores.unsqueeze(2) + transition_matrix + summed_potentials = summed_potentials.view(-1, num_tags) + + scores, paths = torch.topk(summed_potentials, k=top_k, dim=0) + + path_scores = tag_sequence[t, :].unsqueeze(0) + scores + path_indices = torch.cat([path_indices, paths], 0) + + path_indices = path_indices[1:] + path_scores_v = path_scores.view(-1) + viterbi_scores, best_paths = torch.topk(path_scores_v, k=top_k, dim=0) + + n_paths_indices = path_indices.size(0) + + viterbi_paths = torch.zeros(sequence_length, dtype=torch.long, device=tag_sequence.device).unsqueeze(0) + tag_scores = torch.zeros(sequence_length, device=tag_sequence.device).unsqueeze(0) + for i in range(top_k): + viterbi_path = best_paths[0].unsqueeze(0) + + for k in range(n_paths_indices): + t_rev = n_paths_indices - k - 1 + backward_timestep = path_indices[t_rev, :] + tag_id = torch.index_select(backward_timestep.view(-1), 0, viterbi_path[-1]) + viterbi_path = torch.cat([viterbi_path, tag_id], -1) + + viterbi_path = viterbi_path.flip(0) + viterbi_path = viterbi_path % num_tags + viterbi_path = viterbi_path[1:-1] + viterbi_paths = torch.cat([viterbi_paths, viterbi_path.unsqueeze(0)], 0) + + tag_score = torch.gather(tag_sequence[1:-1], 1, viterbi_path.unsqueeze(-1)).view(-1) + tag_scores = torch.cat([tag_scores, tag_score.unsqueeze(0)], 0) + viterbi_paths = viterbi_paths[1:] + tag_scores = tag_scores[1:] + return 
viterbi_paths, tag_scores.exp(), viterbi_scores.exp() + +def predict_ner_single_jit(logits_ner, labels, transition_matrix, tag_pad_id): + mask = labels.ne(tag_pad_id) + logits_padded = torch.masked_select(logits_ner, mask.unsqueeze(2)).view(-1, logits_ner.size()[-1]) + return viterbi_decode_single_jit(logits_padded, transition_matrix) + + +####### +def viterbi_decode_single_python(e, t): + num_tags = len(e[0]) + seq_len = len(e) + start_tag_id = num_tags + end_tag_id = num_tags + 1 + + dp_links = [] + dp = [0.] * num_tags + curr_dp_links = [] + for j in range(num_tags): + dp[j] = t[start_tag_id, j] + e[0][j] + curr_dp_links.append(-1) + dp_links.append(curr_dp_links) + + for i in range(1, seq_len): + new_dp = [] + curr_dp_links = [] + for j in range(num_tags): + all_candidates = [np.logaddexp(t[k, j] + e[i][j], dp[k]) for k in range(num_tags)] + max_k = max(range(num_tags), key=lambda i: all_candidates[i]) + new_dp.append(all_candidates[max_k]) + curr_dp_links.append(max_k) + dp = new_dp + dp_links.append(curr_dp_links) + + all_candidates = [np.logaddexp(t[k, end_tag_id], dp[k]) for k in range(num_tags)] + max_k = max(range(num_tags), key=lambda i: all_candidates[i]) + sentence_score = all_candidates[max_k] + + all_labels = [max_k] + all_labels_score = [t[max_k, end_tag_id]] + + for i in range(seq_len - 1, 0, -1): + curr_k = dp_links[i][max_k] + all_labels.append(curr_k) + all_labels_score.append(t[curr_k, max_k] + e[i][max_k]) + max_k = curr_k + + return sentence_score, all_labels[::-1], all_labels_score[::-1] + +def predict_ner_single_python(logits_ner, labels, transitions, tag_pad_id): + mask = labels.ne(tag_pad_id) + logits_ner_padded = torch.masked_select(logits_ner, mask.unsqueeze(2)).view(-1, logits_ner.size()[-1]) + logits_ner_padded_lsf = F.log_softmax(logits_ner_padded, dim=-1) + score_sentence, pred_tags, score_tags = viterbi_decode_single_python(logits_ner_padded_lsf.cpu().data.numpy(), transitions.cpu().data.numpy()) + return [pred_tags], [np.exp(score_tags)], [np.exp(score_sentence)] diff --git a/NER_BERT/eval_util.py b/NER_BERT/eval_util.py new file mode 100644 index 0000000..d3e8a0f --- /dev/null +++ b/NER_BERT/eval_util.py @@ -0,0 +1,74 @@ +from __future__ import absolute_import, division, print_function + +from seqeval.metrics import precision_score, recall_score, f1_score + +import const +import os +import dataset_utils + +# eval +def get_chunks(seq, ignore_I_mismatch=False): + chunks = [] + chunk_type, chunk_start = None, None + for i, tok in enumerate(seq): + if tok == const.ENTITY_OTHER: + if chunk_type is not None: + chunks.append((chunk_type, chunk_start, i)) + chunk_type, chunk_start = None, None + else: + curr_chunk_type = tok[2:] + chunk_prefix = tok[:2] + if chunk_prefix == const.ENTITY_BEGIN: + if chunk_type is not None: + chunks.append((chunk_type, chunk_start, i)) + + chunk_type, chunk_start = curr_chunk_type, i + elif chunk_prefix == const.ENTITY_CONT and not ignore_I_mismatch: + if chunk_type is not None and chunk_type != curr_chunk_type: + chunks.append((chunk_type, chunk_start, i)) + chunk_type, chunk_start = None, None + # end condition + if chunk_type is not None: + chunks.append((chunk_type, chunk_start, len(seq))) + + chunks = list(set(chunks)) + chunks.sort(key=lambda s: s[0]) + return chunks + +def evaluate(gold_label_list, preds_list, config): + tag_vocab = dataset_utils.get_tag_vocab(config) + gold_label_list = dataset_utils.tagid2tag_seq(tag_vocab, gold_label_list) + preds_list = dataset_utils.tagid2tag_seq(tag_vocab, preds_list) + + results = { 
+ "precision": precision_score(gold_label_list, preds_list), + "recall": recall_score(gold_label_list, preds_list), + "f1": f1_score(gold_label_list, preds_list) + } + return results + + +def dump_result(preds_list, metadata, test_data, root_dir, filename, config): + tag_vocab = dataset_utils.get_tag_vocab(config) + preds_list = dataset_utils.tagid2tag_seq(tag_vocab, preds_list) + + tc = os.path.join(root_dir, 'tc_' + filename) + si = os.path.join(root_dir, 'si_' + filename) + + with open(tc, "w") as tc_writer, open(si, "w") as si_writer: + for i, t in enumerate(preds_list): + article = metadata[i] + sen_start = article['start_sentence'] + article_id = article['article_id'] + ignore_mapping = article['ignore_mapping'] + data_tokens = article['data_tokens'] + + for type, start, end in get_chunks(t): + #adjust start end + orig_tokens = [data_tokens[i] for i in range(start) if ignore_mapping[i] == 1] + start_boundary = len(' '.join(orig_tokens)) + sen_start + orig_tokens = [data_tokens[i] for i in range(start, end) if ignore_mapping[i] == 1] + end_boundary = start_boundary + len(' '.join(orig_tokens)) + si_writer.write(f'{article_id}\t{start_boundary}\t{end_boundary}\n') + tc_writer.write(f'{article_id}\t{type}\t{start_boundary}\t{end_boundary}\n') + diff --git a/NER_BERT/lstm_data_util.py b/NER_BERT/lstm_data_util.py new file mode 100644 index 0000000..c460b8b --- /dev/null +++ b/NER_BERT/lstm_data_util.py @@ -0,0 +1,80 @@ +import torch +import torch.nn as nn +import const +import dataset_utils + +def get_data(examples, vocab, config): + tag_vocab = dataset_utils.get_tag_vocab(config) + tag2id = {t: i for i, t in enumerate(tag_vocab)} + + all_data = [] + for orig_tokens, orig_tag in examples: + input_ids, char_ids, label_ids = prepare_input(orig_tokens, orig_tag, tag2id, vocab, config) + all_data.append((input_ids, char_ids, label_ids)) + return all_data + +def create_char_batch(char_id_seq): + batch_char_ids = [] + max_len = max([len(char_ids) for char_ids in char_id_seq]) + for char_ids in char_id_seq: + pad_len = max_len - len(char_ids) + padding_op = nn.ConstantPad1d((0, pad_len), const.pad_token_id) + batch_char_ids.append(padding_op(char_ids).unsqueeze(0)) + + batch_char_ids = torch.cat(batch_char_ids) + mask = batch_char_ids.ne(const.pad_token_id) + + return batch_char_ids, mask + +def prepare_input(orig_tokens, orig_tag, tag2id, vocab, config): + input_ids = [] + label_ids = [] + char_id_seq = [] + + assert len(orig_tag) == len(orig_tokens), orig_tag + orig_tokens + + for i, w in enumerate(orig_tokens): + label_id = tag2id[orig_tag[i]] + w_id = vocab['word2id'][w] if w in vocab['word2id'] else const.UNK_ID + input_ids.append(w_id) + label_ids.append(label_id) + w_char_ids = [vocab['char2id'][c] if c in vocab['char2id'] else const.UNK_ID for c in w] + w_char_ids = torch.tensor(w_char_ids).long() + + char_id_seq.append(w_char_ids) + + char_ids = create_char_batch(char_id_seq) + input_ids = torch.tensor(input_ids).long() + label_ids = torch.tensor(label_ids).long() + return input_ids, char_ids, label_ids + +def create_batch(train_data, batch_ids, is_cuda): + max_len = max([len(train_data[bi][0]) for bi in batch_ids]) + batch_input_ids = [] + batch_char_ids = [] + batch_label_ids = [] + + for bi in batch_ids: + input_ids, char_ids, label_ids = train_data[bi] + batch_char_ids.append(char_ids) + pad_len = max_len - len(input_ids) + padding_op = nn.ConstantPad1d((0, pad_len), const.pad_token_id) + batch_input_ids.append(padding_op(input_ids).unsqueeze(0)) + padding_op = nn.ConstantPad1d((0, 
pad_len), const.label_pad_id) + batch_label_ids.append(padding_op(label_ids).unsqueeze(0)) + + batch_input_ids = torch.cat(batch_input_ids) + batch_label_ids = torch.cat(batch_label_ids) + + att_mask = batch_input_ids.ne(const.pad_token_id) + + if is_cuda: + batch_input_ids = batch_input_ids.cuda() + batch_label_ids = batch_label_ids.cuda() + att_mask = att_mask.cuda() + + inputs = {'word_ids': batch_input_ids, + 'mask': att_mask, + 'char_ids':batch_char_ids, + 'labels': batch_label_ids} + return inputs diff --git a/NER_BERT/model_bert_train.py b/NER_BERT/model_bert_train.py new file mode 100644 index 0000000..12006d3 --- /dev/null +++ b/NER_BERT/model_bert_train.py @@ -0,0 +1,280 @@ +from __future__ import absolute_import, division, print_function + +import os.path + +import os +import torch +import torch.nn as nn +import numpy as np +import random +import time +import json + +import const +import eval_util +import bert_data_util +import dataset_utils + +from torch.nn import CrossEntropyLoss +from transformers import BertModel, BertConfig +from transformers import AdamW, get_linear_schedule_with_warmup + +is_cuda = torch.cuda.is_available() + +class Config(object): + def __init__(self): + self.num_epoch = 10 + self.weight_decay = 0.0 # 1e-8 + self.batch_size = 8 + self.eval_batch_size = 8 + self.max_grad_norm = 1.0 + self.learning_rate = 5e-5 + self.adam_epsilon = 1e-8 + self.print_interval = 1000 * self.batch_size + self.warmup_steps = 0.0 + self.max_seq_length = 100 # =32 - 2 + self.seed = 42 + self.tagging_type = 'B' + +config = Config() + +#####set seed +random.seed(config.seed) +np.random.seed(config.seed) +torch.manual_seed(config.seed) +if is_cuda > 0: + torch.cuda.manual_seed_all(config.seed) +#####set seed end + +class MyBertForTokenClassification(nn.Module): + def __init__(self, num_tags): + super(MyBertForTokenClassification, self).__init__() + self.bert = BertModel.from_pretrained(const.MODEL_TYPE) + + self.config = BertConfig.from_pretrained(const.MODEL_TYPE) + self.config.num_tags = num_tags + + self.dropout_ner = nn.Dropout(self.config.hidden_dropout_prob) + self.classifier_ner = nn.Linear(self.config.hidden_size, num_tags) + + self.classifier_ner.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + self.classifier_ner.bias.data.zero_() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None + ): + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) + # NER + sequence_output = outputs[0] + sequence_output = self.dropout_ner(sequence_output) + logits_ner = self.classifier_ner(sequence_output) + + outputs = (logits_ner,) + outputs[2:] # add hidden states and attention if they are here + return outputs # (loss), scores_ner, (hidden_states), (attentions) + + def save_pretrained(self, save_directory): + self.config.save_pretrained(save_directory) + output_model_file = os.path.join(save_directory, "pytorch_model.bin") + torch.save(self.state_dict(), output_model_file) + + def get_loss_greedy(self, logits_ner, labels, attention_mask): + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + active_loss = attention_mask.view(-1) == 1 + active_logits = logits_ner.view(-1, self.model_config.num_tags)[active_loss] + active_labels = labels.view(-1)[active_loss] + loss = loss_fct(active_logits, active_labels) + return 
loss + + def get_loss_crf(self, logits_ner, labels, attention_mask): + logits_padded, labels_padded = self.crf.bert_output2crf_input(logits_ner, labels) + loss = self.crf.get_crf_loss(logits_padded, labels_padded) + return loss + + def predict_ner_greedy(self, logits_ner, labels): + logits_ner = logits_ner.softmax(dim=2) + score_tags, pred_tags = logits_ner.max(dim=2) + return -1.0, pred_tags, score_tags, labels + + def predict_ner_viterbi(self, logits_ner, labels): + logits_padded, labels_padded = self.crf.bert_output2crf_input(logits_ner, labels) + mask = labels_padded.ne(self.crf.tag_pad_id) + score_sentence, pred_tags, score_tags = self.crf.viterbi_decode(logits_padded, mask) + return score_sentence, pred_tags, score_tags, labels_padded + +def get_optimizer(model, config, t_total): + # Prepare optimizer and schedule (linear warmup and decay) + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": config.weight_decay}, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0} + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, + num_training_steps=t_total) + + return optimizer, scheduler + + +def predictions(dev_data, model, config): + data_size = len(dev_data) + ids = np.arange(data_size) + eval_loss = 0 + model.eval() + + gold_label_list = [] + preds_list = [] + + for i in range(0, data_size, config.eval_batch_size): + batch_ids = ids[i:i + config.eval_batch_size] + + inputs = bert_data_util.create_batch(dev_data, batch_ids, is_cuda) + outputs_t = model(**inputs) + loss_t, scores_ner_t = outputs_t[:2] + + eval_loss += loss_t.item() + max_value_tag_t, pred_tag_t = torch.max(scores_ner_t, dim=2) + + pred_tag = pred_tag_t.cpu().data.numpy() + gold_tag = inputs['labels'].cpu().data.numpy() + + for k, bi in enumerate(batch_ids): + s_len = len(dev_data[bi][0]) + predict_list = [] + gold_list = [] + for j in range(s_len): + if gold_tag[k][j] == const.label_pad_id: + continue + predict_list.append(pred_tag[k][j]) + gold_list.append(gold_tag[k][j]) + + gold_label_list.append(gold_list) + preds_list.append(predict_list) + eval_loss /= data_size + return preds_list, gold_label_list, eval_loss + +def train(output_root_dir, train_data_seq, dev_data_seq): + tag_vocab = dataset_utils.get_tag_vocab(config) + tag2id = {t: i for i, t in enumerate(tag_vocab)} + + train_data = bert_data_util.get_bert_data(train_data_seq, tag2id, config) + dev_data = bert_data_util.get_bert_data(dev_data_seq, tag2id, config) + + model = MyBertForTokenClassification(num_tags=len(tag2id)) + if is_cuda: + model = model.cuda() + + data_size = len(train_data) + num_batch = np.ceil(data_size / config.batch_size) + t_total = config.num_epoch * num_batch + + optimizer, scheduler = get_optimizer(model, config, t_total) + + exp_loss = None + global_step = 0 + best_dev_f1 = 0 + model.zero_grad() + ids = np.arange(data_size) + for epoch in range(config.num_epoch): + np.random.shuffle(ids) + for i in range(0, data_size, config.batch_size): + batch_ids = ids[i:i + config.batch_size] + + model.train() + inputs = bert_data_util.create_batch(train_data, batch_ids, is_cuda) + outputs = model(**inputs) + loss = outputs[0] + + loss.backward() + nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm) + 
optimizer.step() + scheduler.step() + model.zero_grad() + global_step += 1 + + exp_loss = 0.99 * exp_loss + 0.01 * loss.item() if exp_loss else loss.item() + + if global_step > 0 and global_step % config.print_interval == 0: + print(f'{global_step} / {t_total} train loss: {exp_loss} lr: {scheduler.get_lr()[0]}', flush=True) + + preds_list, gold_label_list, eval_loss = predictions(dev_data, model, config) + results = eval_util.evaluate(preds_list, gold_label_list, tag_vocab) + print(f'{global_step}/{t_total} NER: p/r/f1 {results["precision"]:.5f}/{results["recall"]:.5f}/{results["f1"]:.5f}', flush=True) + + f1 = results['f1'] + if f1 > best_dev_f1: + # output_dir = os.path.join(output_root_dir, 'checkpoint-{}'.format(epoch)) + output_dir = os.path.join(output_root_dir, 'checkpoint') + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + print(f"Saving model checkpoint to {output_dir}", flush=True) + + model.save_pretrained(output_dir) + bert_data_util.bert_tokenizer.save_pretrained(output_dir) + + with open(os.path.join(output_dir, "training_config.json"), 'w') as fout: + json.dump(vars(config), fout) + + print(f'{global_step} / {t_total} train loss: {exp_loss} lr: {scheduler.get_lr()[0]}', flush=True) + +def get_model(model_dir, num_tag): + model = MyBertForTokenClassification(num_tags=num_tag) + + #load model + save_directory = os.path.join(root_dir, model_dir + '/checkpoint') + model_file_path = os.path.join(save_directory, "pytorch_model.bin") + print(f'reading model from {model_file_path}') + state_dict = torch.load(model_file_path, map_location=lambda storage, location: storage) + model.eval() + model.load_state_dict(state_dict, strict=False) + + if is_cuda: + model = model.cuda() + return model + +def process_train(root_dir, data_dir): + output_root_dir = os.path.join(root_dir, f'dl_model_{int(time.time())}') + if not os.path.exists(output_root_dir): + os.makedirs(output_root_dir) + print(f'model out dir {output_root_dir}', flush=True) + + train_data_seq = dataset_utils.get_data_train_dev(data_dir, 'train.jsonl', config) + dev_data_seq = dataset_utils.get_data_train_dev(data_dir, 'dev.jsonl', config) + + train(output_root_dir, train_data_seq, dev_data_seq) + +def process_eval(root_dir, model_dir, data_dir, filename): + tag_vocab = dataset_utils.get_tag_vocab(config) + tag2id = {t: i for i, t in enumerate(tag_vocab)} + + model = get_model(model_dir, len(tag2id)) + + test_data, metadata = dataset_utils.get_data_test(data_dir, filename) + test_data_bert = bert_data_util.get_bert_data(test_data, tag2id, config) + + preds_list, _, _ = predictions(test_data_bert, model, config) + eval_util.dump_result(preds_list, metadata, tag_vocab, test_data, root_dir, 'boundary_model.txt') + +if __name__ == "__main__": + root_dir = os.path.join(os.path.expanduser("~"), 'private-projects/propganda/exp') + data_dir = os.path.join(os.path.expanduser("~"), 'private-projects/propganda/datasets') + #process_train(root_dir, data_dir) + model_dir = os.path.join(os.path.expanduser("~"), 'private-projects/propganda/exp/dl_model_1579925959') + process_eval(root_dir, model_dir, data_dir, 'test_phase0.jsonl') + diff --git a/NER_BERT/model_lstm.py b/NER_BERT/model_lstm.py new file mode 100644 index 0000000..179dbc2 --- /dev/null +++ b/NER_BERT/model_lstm.py @@ -0,0 +1,155 @@ +from __future__ import unicode_literals, print_function, division + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence +from torch.nn import 
Conv1d, ReLU + +from crf import CRF_Loss +import model_utils + +class CHAR_CONV(torch.nn.Module): + def __init__(self, + embedding_dim, + num_filters, + ngram_filter_sizes=(2, 3, 4, 5)): + super(CHAR_CONV, self).__init__() + + self.convolution_layers = torch.nn.ModuleList() + for ngram_size in ngram_filter_sizes: + conv_maxpool = torch.nn.ModuleList() + conv_maxpool.extend([Conv1d( + in_channels=embedding_dim, + out_channels=num_filters, + kernel_size=ngram_size), + ReLU(), + torch.nn.MaxPool1d(kernel_size=num_filters)]) + + self.convolution_layers.append(conv_maxpool) + + def forward(self, tokens, mask): + tokens = tokens * mask.unsqueeze(-1).float() + tokens = torch.transpose(tokens, 1, 2) + + filter_outputs = [] + for conv_maxpool in self.convolution_layers: + filter_outputs.append(conv_maxpool(tokens)) + + maxpool_output = torch.cat(filter_outputs, dim=1) if len(filter_outputs) > 1 else filter_outputs[0] + + return maxpool_output + +class NER_SOFTMAX_CHAR(nn.Module): + def __init__(self, emb_matrix, config, num_tags): + super(NER_SOFTMAX_CHAR, self).__init__() + + embd_vector = torch.from_numpy(emb_matrix['word']).float() + self.word_embeds = nn.Embedding.from_pretrained(embd_vector, freeze=False) + + embd_vector = torch.from_numpy(emb_matrix['char']).float() + self.char_embeds = nn.Embedding.from_pretrained(embd_vector, freeze=False) + + self.lstm_char = nn.LSTM(self.char_embeds.embedding_dim, + config.char_lstm_dim, + num_layers=1, bidirectional=True, batch_first=True) + + input_size = self.word_embeds.embedding_dim + config.char_lstm_dim * 2 + + self.lstm = nn.LSTM(input_size, + config.word_lstm_dim, + num_layers=1, bidirectional=True, batch_first=True) + + self.dropout = nn.Dropout(config.dropout_rate) + self.hidden_layer = nn.Linear(config.word_lstm_dim * 2, config.word_lstm_dim) + self.tanh_layer = torch.nn.Tanh() + + self.hidden2tag = nn.Linear(config.word_lstm_dim, num_tags) + + self.config = config + + model_utils.init_lstm_wt(self.lstm_char) + model_utils.init_lstm_wt(self.lstm) + model_utils.init_linear_wt(self.hidden_layer) + model_utils.init_linear_wt(self.hidden2tag) + + def forward(self, word_ids, mask, char_ids): + lengths = mask.sum(1, dtype=torch.long) + + max_length = torch.max(lengths) + char_emb = [] + word_embed = self.word_embeds(word_ids) + for chars, char_mask in char_ids: + char_len = char_mask.sum(1, dtype=torch.long) + seq_embed = self.char_embeds(chars) + seq_lengths, sort_idx = torch.sort(char_len, descending=True) + _, unsort_idx = torch.sort(sort_idx) + seq_embed = seq_embed[sort_idx] + packed = pack_padded_sequence(seq_embed, seq_lengths, batch_first=True) + output, hidden = self.lstm_char(packed) + lstm_feats, _ = pad_packed_sequence(output, batch_first=True) + lstm_feats = lstm_feats.contiguous() + b, t_k, d = list(lstm_feats.size()) + + seq_rep = lstm_feats.view(b, t_k, 2, -1) #0 is fwd and 1 is bwd + + last_idx = char_len - 1 + seq_rep_fwd = seq_rep[unsort_idx, 0, 0] + seq_rep_bwd = seq_rep[unsort_idx, last_idx, 1] + + seq_out = torch.cat([seq_rep_fwd, seq_rep_bwd], 1) + # fill up the dummy char embedding for padding + seq_out = F.pad(seq_out, (0, 0, 0, max_length - seq_out.size(0))) + char_emb.append(seq_out.unsqueeze(0)) + + char_emb = torch.cat(char_emb, 0) #b x n x c_dim + + word_embed = torch.cat([char_emb, word_embed], 2) + word_embed = self.dropout(word_embed) + + lengths = lengths.view(-1).tolist() + packed = pack_padded_sequence(word_embed, lengths, batch_first=True, enforce_sorted=False) + output, hidden = self.lstm(packed) + + lstm_feats, 
_ = pad_packed_sequence(output, batch_first=True) # h dim = B x t_k x n + lstm_feats = lstm_feats.contiguous() + + b, t_k, d = list(lstm_feats.size()) + + h = self.hidden_layer(lstm_feats.view(-1, d)) + h = self.tanh_layer(h) + logits = self.hidden2tag(h) + logits = logits.view(b, t_k, -1) + + return logits + + +class NER_SOFTMAX_CHAR_CRF(nn.Module): + def __init__(self, emb_matrix, config, tag_pad_id, num_tags): + super(NER_SOFTMAX_CHAR_CRF, self).__init__() + self.featurizer = NER_SOFTMAX_CHAR(emb_matrix, config, num_tags) + self.crf = CRF_Loss(num_tags, config, tag_pad_id) + self.config = config + + def forward(self, word_ids, mask, char_ids, labels=None): + output = self.featurizer(word_ids, mask, char_ids) + if labels is not None: + loss = self.get_loss(output, labels, mask) + output = (loss, output) + return output + + def get_loss(self, logits, y, mask): + if self.config.is_structural_perceptron_loss: + loss = self.crf.structural_perceptron_loss(logits, y) + else: + loss = -1 * self.crf.log_likelihood(logits, y) + + s_lens = mask.sum(1, dtype=torch.long) + + loss = loss / s_lens.float() + loss = loss.mean() + return loss + + def predict(self, emissions, mask): + best_scores, pred = self.crf.viterbi_decode_batch(emissions, mask) + return best_scores, pred diff --git a/NER_BERT/model_lstm_train.py b/NER_BERT/model_lstm_train.py new file mode 100644 index 0000000..6d36b1c --- /dev/null +++ b/NER_BERT/model_lstm_train.py @@ -0,0 +1,258 @@ +from __future__ import absolute_import, division, print_function + +import os.path + +import os +import torch +import torch.nn as nn +import numpy as np +import random +import time +import json +import codecs + +import const +import eval_util +import bert_data_util +import dataset_utils +import model_lstm +import model_utils +import lstm_data_util + +from torch.optim import Adam +from torch.optim.lr_scheduler import LambdaLR + +is_cuda = torch.cuda.is_available() + +class Config(object): + def __init__(self): + self.num_epoch = 10 + self.weight_decay = 0.0 # 1e-8 + self.batch_size = 8 + self.eval_batch_size = 8 + self.max_grad_norm = 1.0 + self.learning_rate = 5e-5 + self.adam_epsilon = 1e-8 + self.print_interval = 1000 * self.batch_size + self.warmup_steps = 0.0 + self.max_seq_length = 100 # =32 - 2 + self.seed = 42 + self.tagging_type = 'B' + self.word_emdb_dim = 100 + self.word_lstm_dim = 100 + self.char_embd_dim = 30 + self.char_lstm_dim = 64 + self.dropout_rate = 0.15 + self.is_structural_perceptron_loss = False + +config = Config() + +#####set seed +random.seed(config.seed) +np.random.seed(config.seed) +torch.manual_seed(config.seed) +if is_cuda > 0: + torch.cuda.manual_seed_all(config.seed) +#####set seed end +def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): + """ Create a schedule with a learning rate that decreases linearly after + linearly increasing during a warmup period. 
+ """ + + def lr_lambda(current_step): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + return max( + 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)) + ) + + return LambdaLR(optimizer, lr_lambda, last_epoch) + +def get_optimizer(model, config, t_total): + optimizer = Adam(model.parameters(), amsgrad=True) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, + num_training_steps=t_total) + return optimizer, scheduler + + +def predictions(dev_data, model, config): + data_size = len(dev_data) + ids = np.arange(data_size) + eval_loss = 0 + model.eval() + + gold_label_list = [] + preds_list = [] + + for i in range(0, data_size, config.eval_batch_size): + batch_ids = ids[i:i + config.eval_batch_size] + + inputs = lstm_data_util.create_batch(dev_data, batch_ids, is_cuda) + outputs_t = model(**inputs) + loss_t, scores_ner_t = outputs_t[:2] + best_scores, pred_tag_t = model.predict(scores_ner_t, inputs['mask']) + + eval_loss += loss_t.item() + + pred_tag = pred_tag_t.cpu().data.numpy() + gold_tag = inputs['labels'].cpu().data.numpy() + + for k, bi in enumerate(batch_ids): + s_len = len(dev_data[bi][0]) + predict_list = [] + gold_list = [] + for j in range(s_len): + if gold_tag[k][j] == const.label_pad_id: + continue + predict_list.append(pred_tag[k][j]) + gold_list.append(gold_tag[k][j]) + + gold_label_list.append(gold_list) + preds_list.append(predict_list) + eval_loss /= data_size + return preds_list, gold_label_list, eval_loss + +def train(output_root_dir, embd, train_data, dev_data): + num_tags = len(dataset_utils.get_tag_vocab(config)) + model = model_lstm.NER_SOFTMAX_CHAR_CRF(embd, config, const.label_pad_id, num_tags) + + if is_cuda: + model = model.cuda() + + data_size = len(train_data) + num_batch = np.ceil(data_size / config.batch_size) + t_total = config.num_epoch * num_batch + + optimizer, scheduler = get_optimizer(model, config, t_total) + + exp_loss = None + global_step = 0 + best_dev_f1 = 0 + model.zero_grad() + ids = np.arange(data_size) + for epoch in range(config.num_epoch): + np.random.shuffle(ids) + for i in range(0, data_size, config.batch_size): + batch_ids = ids[i:i + config.batch_size] + + model.train() + inputs = lstm_data_util.create_batch(train_data, batch_ids, is_cuda) + outputs = model(**inputs) + loss = outputs[0] + + loss.backward() + nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm) + optimizer.step() + scheduler.step() + model.zero_grad() + global_step += 1 + + exp_loss = 0.99 * exp_loss + 0.01 * loss.item() if exp_loss else loss.item() + + if global_step > 0 and global_step % config.print_interval == 0: + print(f'{global_step} / {t_total} train loss: {exp_loss} lr: {scheduler.get_lr()[0]}', flush=True) + + preds_list, gold_label_list, eval_loss = predictions(dev_data, model, config) + results = eval_util.evaluate(preds_list, gold_label_list, config) + print(f'{global_step}/{t_total} NER: p/r/f1 {results["precision"]:.5f}/{results["recall"]:.5f}/{results["f1"]:.5f}', flush=True) + + f1 = results['f1'] + if f1 > best_dev_f1: + # output_dir = os.path.join(output_root_dir, 'checkpoint-{}'.format(epoch)) + output_dir = os.path.join(output_root_dir, 'checkpoint') + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + print(f"Saving model checkpoint to {output_dir}", flush=True) + + model.save_pretrained(output_dir) + bert_data_util.bert_tokenizer.save_pretrained(output_dir) + + with 
open(os.path.join(output_dir, "training_config.json"), 'w') as fout: + json.dump(vars(config), fout) + + print(f'{global_step} / {t_total} train loss: {exp_loss} lr: {scheduler.get_lr()[0]}', flush=True) + + +def process_train(root_dir, data_dir, glove_path): + output_root_dir = os.path.join(root_dir, f'dl_model_{int(time.time())}') + if not os.path.exists(output_root_dir): + os.makedirs(output_root_dir) + print(f'model out dir {output_root_dir}', flush=True) + + train_data_seq = dataset_utils.get_data_train_dev(data_dir, 'train.jsonl', config) + dev_data_seq = dataset_utils.get_data_train_dev(data_dir, 'dev.jsonl', config) + + word_emb_matrix, word2id, id2word = model_utils.get_word_embd(config, glove_path, train_data_seq) + char_emb_matrix, char2id, id2char = model_utils.get_char_embd(config, id2word) + + vocab = { + 'word2id': word2id, + 'id2word': id2word, + 'char2id': char2id, + 'id2char': id2char + } + embd = { + 'word': word_emb_matrix, + 'char': char_emb_matrix + } + with codecs.open(os.path.join(output_root_dir, 'word.vocab'), "w", encoding="utf8") as f: + f.write('\n'.join(id2word) + '\n') + with codecs.open(os.path.join(output_root_dir, 'char.vocab'), "w", encoding="utf8") as f: + f.write('\n'.join(id2char) + '\n') + + train_data = lstm_data_util.get_data(train_data_seq, vocab, config) + dev_data = lstm_data_util.get_data(dev_data_seq, vocab, config) + + train(output_root_dir, embd, train_data, dev_data) + + +def get_model(model_dir, vocab, num_tag): + embd = model_utils.get_random_embedding(vocab, config) + model = model_lstm.NER_SOFTMAX_CHAR_CRF(embd, config, const.label_pad_id, num_tag) + + #load model + save_directory = os.path.join(root_dir, model_dir + '/checkpoint') + model_file_path = os.path.join(save_directory, "pytorch_model.bin") + print(f'reading model from {model_file_path}') + state_dict = torch.load(model_file_path, map_location=lambda storage, location: storage) + model.eval() + model.load_state_dict(state_dict, strict=False) + + if is_cuda: + model = model.cuda() + return model + +def process_eval(root_dir, model_dir, data_dir, filename): + tag_vocab = dataset_utils.get_tag_vocab(config) + tag2id = {t: i for i, t in enumerate(tag_vocab)} + + with codecs.open(os.path.join(model_dir, 'word.vocab'), "r", encoding="utf8") as f: + id2word = f.readlines().split('\n') + word2id = {v: k for k, v in enumerate(id2word)} + with codecs.open(os.path.join(model_dir, 'char.vocab'), "r", encoding="utf8") as f: + id2char = f.readlines().split('\n') + char2id = {v: k for k, v in enumerate(id2char)} + vocab = { + 'word2id': word2id, + 'id2word': id2word, + 'char2id': char2id, + 'id2char': id2char + } + model = get_model(model_dir, vocab, len(tag2id)) + + test_data_seq, metadata = dataset_utils.get_data_test(data_dir, filename) + test_data = lstm_data_util.get_data(test_data_seq, vocab, config) + + preds_list, _, _ = predictions(test_data, model, config) + eval_util.dump_result(preds_list, metadata, test_data, root_dir, 'boundary_model.txt', config) + +if __name__ == "__main__": + prefix = 'prop' #'private-projects/propganda' + root_dir = os.path.join(os.path.expanduser("~"), prefix + '/exp') + data_dir = os.path.join(os.path.expanduser("~"), prefix + '/datasets') + glove_dir = os.path.join(os.path.expanduser("~"), 'dl_entity/glove.6B/glove.6B.100d.txt') + process_train(root_dir, data_dir, glove_dir) + #model_dir = os.path.join(os.path.expanduser("~"), 'private-projects/propganda/exp/dl_model_1579925959') + #process_eval(root_dir, model_dir, data_dir, 'test_phase0.jsonl') + diff 
--git a/NER_BERT/model_utils.py b/NER_BERT/model_utils.py new file mode 100644 index 0000000..631c54d --- /dev/null +++ b/NER_BERT/model_utils.py @@ -0,0 +1,99 @@ +import numpy as np +import codecs +import re +from collections import Counter + +import const + +def init_lstm_wt(lstm): + for names in lstm._all_weights: + for name in names: + if name.startswith('weight_'): + wt = getattr(lstm, name) + drange = np.sqrt(6. / (np.sum(wt.size()))) + wt.data.uniform_(-drange, drange) + + elif name.startswith('bias_'): + # set forget bias to 1 + bias = getattr(lstm, name) + n = bias.size(0) + start, end = n // 4, n // 2 + bias.data.fill_(0.) + bias.data[start:end].fill_(1.) + + +def init_linear_wt(linear): + drange = np.sqrt(6. / (np.sum(linear.weight.size()))) + linear.weight.data.uniform_(-drange, drange) + + if linear.bias is not None: + linear.bias.data.fill_(0.) + + +def get_glove(glove_path): + print("Loading GLoVE vectors from file: {}".format(glove_path)) + word_to_vector = {} + + # go through glove vecs + with codecs.open(glove_path, 'r', 'utf-8') as fh: + for line in fh: + line = re.split('\s+', line.strip()) + word = line[0] + vector = list(map(float, line[1:])) + word_to_vector[word] = vector + + return word_to_vector + +def get_word_embd(config, glove_path, examples): + word_to_vector = get_glove(glove_path) + + word_freq_map = Counter() + for tokens, tags in examples: + word_freq_map.update(tokens) + + word_freq_map.update(word_to_vector.keys()) + orig_tokens = set([w for w, ct in word_freq_map.most_common()]) + orig_tokens = sorted(list(orig_tokens.union(word_to_vector.keys()))) + + id_to_word = const._START_VOCAB.copy() + id_to_word.extend(orig_tokens) + + word_to_id = {v: k for k, v in enumerate(id_to_word)} + + word_emb_matrix = np.random.uniform(low=-1.0, high=1.0, + size=(len(id_to_word), config.word_emdb_dim)) + pretrained_init = 0 + for wid, w in enumerate(id_to_word): + if w in word_to_vector: + word_emb_matrix[wid, :] = word_to_vector[w] + pretrained_init += 1 + + return word_emb_matrix, word_to_id, id_to_word + +def get_char_embd(config, id_to_word): + char_freq_map = Counter() + for w in id_to_word: + char_freq_map.update([c for c in w]) + + id_to_char = const._START_VOCAB.copy() + id_to_char.extend([c for c, ct in char_freq_map.most_common()]) + + char_to_id = {v: k for k, v in enumerate(id_to_char)} + + char_emb_matrix = np.random.uniform(low=-1.0, high=1.0, + size=(len(id_to_char), config.char_embd_dim)) + + return char_emb_matrix, char_to_id, id_to_char + +def get_random_embedding(vocab, config): + word_emb_matrix = np.random.uniform(low=-1.0, high=1.0, + size=(len(vocab['id2word']), config.word_emdb_dim)) + + char_emb_matrix = np.random.uniform(low=-1.0, high=1.0, + size=(len(vocab['id2char']), config.char_embd_dim)) + + embd = { + 'word': word_emb_matrix, + 'char': char_emb_matrix + } + return embd diff --git a/NER_BERT/transformer.py b/NER_BERT/transformer.py new file mode 100644 index 0000000..a2ce657 --- /dev/null +++ b/NER_BERT/transformer.py @@ -0,0 +1,110 @@ +#Code is based on http://nlp.seas.harvard.edu/2018/04/03/attention.html + +from __future__ import unicode_literals, print_function, division + +import torch +import torch.nn as nn +import torch.nn.functional as F +import logging +import math + +logging.basicConfig(level=logging.INFO) + +class PositionalEncoding(nn.Module): + def __init__(self, d_model, dropout, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) + 
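+        # precompute the sinusoidal table once: even dimensions get sin, odd get cos,
+        # each scaled by 1/10000^(2i/d_model); the table is registered as a buffer below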
position = torch.arange(0, max_len).float().unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * + -(math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + x = x + self.pe[:, :x.size(1)] + return self.dropout(x) + +class MultiHeadedAttention(nn.Module): + def __init__(self, num_head, d_model, dropout=0.1): + super(MultiHeadedAttention, self).__init__() + assert d_model % num_head == 0 + self.d_k = d_model // num_head #d_k == d_v + self.h = num_head + + self.linear_key = nn.Linear(d_model, d_model) + self.linear_value = nn.Linear(d_model, d_model) + self.linear_query = nn.Linear(d_model, d_model) + self.linear_out = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(p=dropout) + + def attention(self, query, key, value, mask, dropout=None): + d_k = query.size(-1) + scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k) + scores = scores.masked_fill(mask == 0, -1e9) + + p_attn = F.softmax(scores, dim=-1) + if dropout is not None: + p_attn = dropout(p_attn) + return torch.matmul(p_attn, value), p_attn + + def forward(self, query, key, value, mask): + nbatches = query.size(0) + query = self.linear_query(query).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) + key = self.linear_key(key).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) + value = self.linear_value(value).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) + + mask = mask.unsqueeze(1) + x, attn = self.attention(query, key, value, mask, dropout=self.dropout) + x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k) + return self.linear_out(x) + +class AffineLayer(nn.Module): + def __init__(self, dropout, d_model, d_ff): + super(AffineLayer, self).__init__() + self.w_1 = nn.Linear(d_model, d_ff) + self.w_2 = nn.Linear(d_ff, d_model) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + return self.w_2(self.dropout(F.relu(self.w_1(x)))) + +class EncoderLayer(nn.Module): + def __init__(self, num_head, dropout, d_model, d_ff): + super(EncoderLayer, self).__init__() + + self.att_layer = MultiHeadedAttention(num_head, d_model, dropout) + self.norm_att = nn.LayerNorm(d_model) + self.dropout_att = nn.Dropout(dropout) + + self.affine_layer = AffineLayer(dropout, d_model, d_ff) + self.norm_affine = nn.LayerNorm(d_model) + self.dropout_affine = nn.Dropout(dropout) + + def forward(self, x, mask): + x_att = self.norm_att(x*mask) + x_att = self.att_layer(x_att, x_att, x_att, mask) + x = x + self.dropout_att(x_att) + + x_affine = self.norm_affine(x*mask) + x_affine = self.affine_layer(x_affine) + return x + self.dropout_affine(x_affine) + +class Encoder(nn.Module): + def __init__(self, N, num_head, dropout, d_model, d_ff): + super(Encoder, self).__init__() + self.position = PositionalEncoding(d_model, dropout) + self.layers = nn.ModuleList() + for _ in range(N): + self.layers.append(EncoderLayer(num_head, dropout, d_model, d_ff)) + self.norm = nn.LayerNorm(d_model) + + def forward(self, word_embed, mask): + x = self.position(word_embed) + for layer in self.layers: + x = layer(x, mask) + return self.norm(x*mask) diff --git a/README.md b/README.md index dafe84b..9cc25a9 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,10 @@ -# Sequence Prediction +# Various BERT-CRF and BiLSTM-CRF based model for NER ## TO-DO ### Datset - - [x] conll2003 - - [ ] atis + ### Neural NER - - [x] CharLSTM+WordLSTM+CRF: [Lample .etc, 
NAACL16](http://www.aclweb.org/anthology/N/N16/N16-1030.pdf) - - [x] Make a CoNLL-2003 batcher @@ -12,6 +13,8 @@ - - [x] Implement CharLSTM + WordLSTM + softmax - - [x] Implement CharLSTM + WordLSTM + CRF - - [x] Tranformer encoder + CRF +- - [x] BERT encoder + CRF +- - [x] pytorch JIT compilable Viterbi Decoder https://github.com/atulkum/sequence_prediction/blob/master/NER_BERT/decoder.py#L9 ### Slot Filling + intent prediciton - - [ ] [Attention-Based Recurrent Neural Network Models for Joint Intent Detection and Slot Filling](https://arxiv.org/abs/1609.01454) diff --git a/neural_ner/model_lstm.py b/neural_ner/model_lstm.py index f42d207..cfab61c 100644 --- a/neural_ner/model_lstm.py +++ b/neural_ner/model_lstm.py @@ -108,11 +108,10 @@ def forward(self, batch): h = self.tanh_layer(h) logits = self.hidden2tag(h) logits = logits.view(b, t_k, -1) - + logits = F.log_softmax(logits, dim=2) return logits def neg_log_likelihood(self, logits, y, s_lens): - log_smx = F.log_softmax(logits, dim=2) loss = F.nll_loss(log_smx.transpose(1, 2), y, ignore_index=Constants.TAG_PAD_ID, reduction='none') loss = loss.sum(dim=1) / s_lens.float() loss = loss.mean() diff --git a/transformer_models/dataset_roberta.py b/transformer_models/dataset_roberta.py new file mode 100644 index 0000000..1a61c01 --- /dev/null +++ b/transformer_models/dataset_roberta.py @@ -0,0 +1,154 @@ + +import torch +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import Dataset +from transformers import RobertaTokenizer +import pandas as pd +from ast import literal_eval +from torch.nn import CrossEntropyLoss + +tokenizer = RobertaTokenizer.from_pretrained('roberta-base') +id2tag = ['O', 'B-toxic', 'I-toxic'] +tag2id = {v:k for k, v in enumerate(id2tag)} +tag_pad_id = CrossEntropyLoss().ignore_index + +def encode_roberta(sentence): + sentence_tokens = [tokenizer.tokenize(sentence[0])] + \ + [tokenizer.tokenize(f' {t}') for t in sentence[1:]] + sentence_ids = [tokenizer.convert_tokens_to_ids(t) for t in sentence_tokens] + start_idx_mask = [] + all_ids = [] + for subwords in sentence_ids: + curr_mask = [1] + if len(subwords) > 1: + curr_mask += [0] * (len(subwords) - 1) + start_idx_mask.extend(curr_mask) + all_ids.extend(subwords) + special_token_mask = tokenizer.get_special_tokens_mask(all_ids) + + prefix_offset = 0 + while prefix_offset < len(special_token_mask) and special_token_mask[prefix_offset] == 1: + prefix_offset += 1 + suffix_offset = len(special_token_mask) - len(start_idx_mask) - prefix_offset + start_idx_mask = [0] * prefix_offset + start_idx_mask + [0] * suffix_offset + + sentence_inputs = tokenizer.prepare_for_model(all_ids, add_special_tokens=True) + input_ids = sentence_inputs["input_ids"] + attention_mask = sentence_inputs["attention_mask"] + ####### + inputs = tokenizer( + text=' '.join(sentence), + add_special_tokens=True + ) + assert inputs["input_ids"] == input_ids + assert inputs["attention_mask"] == attention_mask + ####### + return input_ids, attention_mask, start_idx_mask + +def get_labels_tokens(orig_sentence, chunks): + curr = 0 + labels = [] + tokens = [] + for s, e in chunks: + other_txt = orig_sentence[curr:s].split() + label_txt = orig_sentence[s:e + 1].split() + curr = e + 1 + tokens.extend(other_txt) + labels.extend(['O'] * len(other_txt)) + + tokens.append(label_txt[0]) + labels.append('B-toxic') + for k in range(1, len(label_txt)): + tokens.append(label_txt[k]) + labels.append('I-toxic') + if curr < len(orig_sentence): + other_txt = orig_sentence[curr:].split() + 
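+            # any text after the last toxic span is tokenized and tagged 'O'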
tokens.extend(other_txt) + labels.extend(['O'] * len(other_txt)) + return tokens, labels + +def get_chunks(span): + chunks = [] + curr_start = None + for span_i, t in enumerate(span): + if span_i == 0 or curr_start is None: + curr_start = t + elif t > span[span_i - 1] + 1: + chunks.append((curr_start, span[span_i - 1])) + curr_start = t + if curr_start is not None: + chunks.append((curr_start, span[-1])) + return chunks + +def get_text_from_ids(input_ids): + return tokenizer.convert_tokens_to_string( + [tokenizer._convert_id_to_token(input_id) for input_id in input_ids]) + +class SpanDataset(Dataset): + def __getitem__(self, n): + return self._features[n] + + def __len__(self): + return len(self._features) + + def __init__(self, phase): + self._phase = phase + self.init_dataset() + + def init_dataset(self): + train = pd.read_csv("tsd_train.csv") + sentences = train['text'] + if self._phase in {'train', 'dev'}: + spans = train.spans.apply(literal_eval) + max_seq_len = -1 + max_token_len = -1 + features = [] + for i, orig_sentence in enumerate(sentences): + chunks = [] + if self._phase in {'train', 'dev'}: + chunks = get_chunks(spans[i]) + + tokens, labels = get_labels_tokens(orig_sentence, chunks) + # roberta tokenization + input_ids, attention_mask, start_idx_mask = encode_roberta(tokens) + max_seq_len = max(max_seq_len, len(input_ids)) + max_token_len = max(max_token_len, len(labels)) + labels_ids = [tag2id[k] for k in labels] + padded_labels_ids = labels_ids + [tag_pad_id]*(200 - len(labels_ids)) + datum = { + 'input_ids': torch.LongTensor(input_ids), + 'attention_mask': torch.LongTensor(attention_mask), + 'start_idx_mask': torch.BoolTensor(start_idx_mask), + 'labels': torch.LongTensor(labels_ids), + 'padded_labels': torch.LongTensor(padded_labels_ids) + } + features.append(datum) + print(f'max_seq_len {max_seq_len} max_token_len {max_token_len}') + self._features = features + +def variable_collate_fn(batch): + batch_features = {} + + batch_features['input_ids'] = pad_sequence([x['input_ids'] for x in batch], + batch_first=True, + padding_value=tokenizer.pad_token_id) + batch_features['attention_mask'] = pad_sequence([x['attention_mask'] for x in batch], + batch_first=True, + padding_value=0) + batch_features['start_idx_mask'] = pad_sequence([x['start_idx_mask'] for x in batch], + batch_first=True, + padding_value=0) + if 'labels' in batch[0]: + batch_features['labels'] = pad_sequence([x['labels'] for x in batch], + batch_first=True, + padding_value=tag_pad_id) + batch_features['padded_labels'] = pad_sequence([x['padded_labels'] for x in batch], + batch_first=True, + padding_value=tag_pad_id) + return batch_features + +if __name__ == '__main__': + data_iter = SpanDataset('dev') + for d in data_iter: + print(d) + break diff --git a/transformer_models/model.py b/transformer_models/model.py new file mode 100644 index 0000000..5fccdb4 --- /dev/null +++ b/transformer_models/model.py @@ -0,0 +1,168 @@ +import torch.nn as nn +import numpy as np +from torch.nn.utils.rnn import pad_sequence +import torch.nn.functional as F +from transformers import BertPreTrainedModel, RobertaModel, RobertaConfig +import torch +from torch.nn import CrossEntropyLoss + +tag_pad_id = CrossEntropyLoss().ignore_index + +class CRFBert(BertPreTrainedModel): + config_class = RobertaConfig + base_model_prefix = "roberta" + def __init__(self, config): + super(CRFBert, self).__init__(config) + num_tags = 3 + self.roberta = RobertaModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = 
nn.Linear(config.hidden_size, num_tags) + self.init_weights() + + #crf + self.start_tag, self.end_tag = num_tags, num_tags + 1 + self.transitions = nn.Parameter(torch.Tensor(num_tags + 2, num_tags + 2)) + nn.init.constant_(self.transitions, -np.log(num_tags)) + self.transitions.data[self.end_tag, :] = -10000 + self.transitions.data[:, self.start_tag] = -10000 + + def forward(self, input_ids, attention_mask, start_idx_mask, labels, **kwargs): + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + output_attentions=True, + output_hidden_states=True, + return_dict=True + ) + token_embedding = self.dropout(outputs.last_hidden_state) + #get start idx embedding of each tokens + start_idx_lens = start_idx_mask.sum(1).view(-1) + embd_selected = torch.masked_select(token_embedding, + start_idx_mask.unsqueeze(2)).view(-1,token_embedding.size()[-1]) + embd_split = torch.split(embd_selected, start_idx_lens.tolist()) + embd_padded = pad_sequence(embd_split, batch_first=True, padding_value=0) + logits = self.classifier(embd_padded) + logits= F.log_softmax(logits, dim=-1) + mask = labels.ne(tag_pad_id) + loss = self.get_crf_loss(logits, labels, mask) + outputs = (loss,) + if not self.training: + sentence_score, pred_tag = self.viterbi_decode_batch(logits, mask) + pred_tag = nn.ConstantPad1d((0, 200 - pred_tag.size(1)), tag_pad_id)(pred_tag) + outputs += (pred_tag,) + return outputs + + def viterbi_decode_batch(self, emissions, mask): + seq_len = emissions.shape[1] + options = dict(dtype=emissions.dtype, device=emissions.device) + + log_prob = emissions[:, 0].clone() + log_prob += self.transitions[self.start_tag, : self.start_tag].unsqueeze(0) + + end_scores = log_prob + self.transitions[: self.start_tag, self.end_tag].unsqueeze(0) + + best_scores_list = [] + best_scores_list.append(end_scores.unsqueeze(1)) + + best_paths_0 = torch.Tensor().long().to(emissions.device) + best_paths_list = [best_paths_0] + + for idx in range(1, seq_len): + broadcast_emissions = emissions[:, idx].unsqueeze(1) + broadcast_transmissions = self.transitions[: self.start_tag, : self.start_tag].unsqueeze(0) + broadcast_log_prob = log_prob.unsqueeze(2) + score = broadcast_emissions + broadcast_transmissions + broadcast_log_prob + max_scores, max_score_indices = torch.max(score, 1) + best_paths_list.append(max_score_indices.unsqueeze(1)) + end_scores = max_scores + self.transitions[: self.start_tag, self.end_tag].unsqueeze(0) + + best_scores_list.append(end_scores.unsqueeze(1)) + log_prob = max_scores + + best_scores = torch.cat(best_scores_list, 1).float() + best_paths = torch.cat(best_paths_list, 1) + + max_scores, max_indices_from_scores = torch.max(best_scores, 2) + + valid_index_tensor = torch.tensor(0, **options).long() + padding_tensor = torch.tensor(tag_pad_id, **options).long() + + labels = max_indices_from_scores[:, seq_len - 1] + labels = torch.where(mask[:, seq_len - 1] != 1.0, padding_tensor, labels) + all_labels = labels.unsqueeze(1).long() + + for idx in range(seq_len - 2, -1, -1): + indices_for_lookup = all_labels[:, -1].clone() + indices_for_lookup = torch.where(indices_for_lookup == tag_pad_id, valid_index_tensor, + indices_for_lookup) + + indices_from_prev_pos = best_paths[:, idx, :].gather(1, indices_for_lookup.view(-1, 1).long()).squeeze(1) + indices_from_prev_pos = torch.where(mask[:, idx + 1] != 1.0, padding_tensor, indices_from_prev_pos) + + indices_from_max_scores = max_indices_from_scores[:, idx] + indices_from_max_scores = torch.where(mask[:, idx + 1] == 1.0, padding_tensor, 
indices_from_max_scores) + + labels = torch.where(indices_from_max_scores == tag_pad_id, indices_from_prev_pos, + indices_from_max_scores) + + # Set to ignore_index if present state is not valid. + labels = torch.where(mask[:, idx] != 1.0, padding_tensor, labels) + all_labels = torch.cat((all_labels, labels.view(-1, 1).long()), 1) + + last_tag_indices = mask.sum(1, dtype=torch.long) - 1 + sentence_score = max_scores.gather(1, last_tag_indices.view(-1, 1)).squeeze(1) + + return sentence_score, torch.flip(all_labels, [1]) + + def get_log_p_z(self, emissions, mask): + seq_len = emissions.shape[1] + log_alpha = emissions[:, 0].clone() + log_alpha += self.transitions[self.start_tag, : self.start_tag].unsqueeze(0) + + for idx in range(1, seq_len): + broadcast_emissions = emissions[:, idx].unsqueeze(1) + broadcast_transitions = self.transitions[: self.start_tag, : self.start_tag].unsqueeze(0) + broadcast_logprob = log_alpha.unsqueeze(2) + score = broadcast_logprob + broadcast_emissions + broadcast_transitions + + score = torch.logsumexp(score, 1) + log_alpha = score * mask[:, idx].unsqueeze(1) + log_alpha.squeeze(1) * (1.0 - mask[:, idx].unsqueeze(1)) + + log_alpha += self.transitions[: self.start_tag, self.end_tag].unsqueeze(0) + return torch.logsumexp(log_alpha.squeeze(1), 1) + + def get_log_p_Y_X(self, emissions, mask, orig_tags): + seq_len = emissions.shape[1] + tags = orig_tags.clone() + tags[tags < 0] = 0 + + llh = self.transitions[self.start_tag, tags[:, 0]].unsqueeze(1) + llh += emissions[:, 0, :].gather(1, tags[:, 0].view(-1, 1)) * mask[:, 0].unsqueeze(1) + + for idx in range(1, seq_len): + old_state, new_state = ( + tags[:, idx - 1].view(-1, 1), + tags[:, idx].view(-1, 1), + ) + emission_scores = emissions[:, idx, :].gather(1, new_state) + transition_scores = self.transitions[old_state, new_state] + llh += (emission_scores + transition_scores) * mask[:, idx].unsqueeze(1) + + last_tag_indices = mask.sum(1, dtype=torch.long) - 1 + last_tags = tags.gather(1, last_tag_indices.view(-1, 1)) + + llh += self.transitions[last_tags.squeeze(1), self.end_tag].unsqueeze(1) + + return llh.squeeze(1) + + def log_likelihood(self, emissions, tags, mask): + log_z = self.get_log_p_z(emissions, mask) + log_p_y_x = self.get_log_p_Y_X(emissions, mask, tags) + return log_p_y_x - log_z + + def get_crf_loss(self, logits, y, mask): + s_lens = mask.sum(1) + loss = -1 * self.log_likelihood(logits, y, mask.float()) + loss = loss / s_lens.float() + loss = loss.mean() + return loss diff --git a/transformer_models/train_roberta.py b/transformer_models/train_roberta.py new file mode 100644 index 0000000..1be5ddc --- /dev/null +++ b/transformer_models/train_roberta.py @@ -0,0 +1,62 @@ +import time +import random +import numpy as np + +import torch +from transformers import Trainer, TrainingArguments +from torch.utils.data import random_split + +from .dataset_roberta import SpanDataset, variable_collate_fn +from .eval_utils import compute_metrics +from .model import CRFBert + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available() > 0: + torch.cuda.manual_seed_all(seed) + +def train(): + all_dataset = SpanDataset('train') + train_size = int(0.99 * len(all_dataset)) + test_size = len(all_dataset) - train_size + train_dataset, eval_dataset = random_split(all_dataset, [train_size, test_size]) + + model = CRFBert.from_pretrained('roberta-base') + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(f'Number of trainable parameter: 
{n_params}')
+
+    training_args = TrainingArguments(
+        output_dir=f'./results_{int(time.time())}',  # output directory
+        num_train_epochs=3,              # total # of training epochs
+        per_device_train_batch_size=8,   # batch size per device during training
+        per_device_eval_batch_size=8,    # batch size for evaluation
+        warmup_steps=500,                # number of warmup steps for learning rate scheduler
+        weight_decay=0.01,               # strength of weight decay
+        logging_dir='./logs',            # directory for storing logs
+        save_total_limit=1,
+        seed=42,
+        label_names=["padded_labels"]
+    )
+    # Initialize our Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        compute_metrics=compute_metrics,
+        data_collator=variable_collate_fn,
+    )
+    #trainer.train()
+    #trainer.save_model()
+    result = trainer.evaluate()
+    print(result)
+
+if __name__ == '__main__':
+    set_seed(42)
+    train()
+
+