diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/NER_BERT/LICENSE b/NER_BERT/LICENSE new file mode 100644 index 0000000..642dede --- /dev/null +++ b/NER_BERT/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Atul Kumar + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/NER_BERT/README.md b/NER_BERT/README.md new file mode 100644 index 0000000..0f0b7d0 --- /dev/null +++ b/NER_BERT/README.md @@ -0,0 +1 @@ +# WIP: BERT vs BiLSTM NER diff --git a/NER_BERT/bert_data_util.py b/NER_BERT/bert_data_util.py new file mode 100644 index 0000000..3acc676 --- /dev/null +++ b/NER_BERT/bert_data_util.py @@ -0,0 +1,92 @@ +from __future__ import absolute_import, division, print_function + +import os + +from transformers import BertTokenizer + +import torch +import torch.nn as nn + +import const + +bert_tokenizer = BertTokenizer.from_pretrained(const.MODEL_TYPE, do_lower_case=True) + +def get_bert_data(examples, tag2id, config): + bert_data = [] + for orig_tokens, orig_tag in examples: + input_ids, label_ids, segment_ids, tokens = prepare_bert_input(orig_tokens, orig_tag, tag2id, config) + bert_data.append((input_ids, label_ids, segment_ids)) + return bert_data + +def prepare_bert_input(orig_tokens, orig_tag, tag2id, config): + tokens = [] + label_ids = [] + + assert len(orig_tag) == len(orig_tokens), orig_tag + orig_tokens + + for i, t in enumerate(orig_tokens): + label_t = tag2id[orig_tag[i]] + bert_tokens = bert_tokenizer.tokenize(t) + bert_tokens_len = len(bert_tokens) + if bert_tokens_len > 0: + tokens.extend(bert_tokens) + label_ids.append(label_t) + + # pad label if multiple tokens for a single word + if bert_tokens_len > 1: + label_ids.extend([const.label_pad_id] * (bert_tokens_len - 1)) + + assert len(tokens) == len(label_ids) + ###truncate large sequence### + tokens = tokens[:config.max_seq_length] + label_ids = label_ids[:config.max_seq_length] + ############################## + segment_ids = [const.sequence_a_segment_id] * len(tokens) + + tokens = [const.cls_token] + tokens + [const.sep_token] + + label_ids = [const.label_pad_id] + label_ids + [const.label_pad_id] + segment_ids = [const.cls_token_segment_id] + segment_ids + 
[const.sequence_a_segment_id] + input_ids = bert_tokenizer.convert_tokens_to_ids(tokens) + + assert len(input_ids) == len(label_ids) + assert len(input_ids) == len(segment_ids) + + input_ids = torch.tensor(input_ids).long() + label_ids = torch.tensor(label_ids).long() + segment_ids = torch.tensor(segment_ids).long() + return input_ids, label_ids, segment_ids, tokens + +def create_batch(train_data, batch_ids, is_cuda): + max_len = max([len(train_data[bi][0]) for bi in batch_ids]) + batch_input_ids = [] + batch_label_ids = [] + batch_segment_ids = [] + for bi in batch_ids: + input_ids, label_ids, segment_ids = train_data[bi] + pad_len = max_len - len(input_ids) + padding_op = nn.ConstantPad1d((0, pad_len), const.pad_token_id) + batch_input_ids.append(padding_op(input_ids).unsqueeze(0)) + padding_op = nn.ConstantPad1d((0, pad_len), const.label_pad_id) + batch_label_ids.append(padding_op(label_ids).unsqueeze(0)) + padding_op = nn.ConstantPad1d((0, pad_len), const.pad_token_segment_id) + batch_segment_ids.append(padding_op(segment_ids).unsqueeze(0)) + + batch_input_ids = torch.cat(batch_input_ids) + batch_label_ids = torch.cat(batch_label_ids) + batch_segment_ids = torch.cat(batch_segment_ids) + + att_mask = batch_input_ids.ne(const.pad_token_id) + + if is_cuda: + batch_input_ids = batch_input_ids.cuda() + batch_label_ids = batch_label_ids.cuda() + batch_segment_ids = batch_segment_ids.cuda() + att_mask = att_mask.cuda() + + inputs = {'input_ids': batch_input_ids, + 'attention_mask': att_mask, + 'token_type_ids': batch_segment_ids, + 'labels': batch_label_ids} + return inputs + diff --git a/NER_BERT/const.py b/NER_BERT/const.py new file mode 100644 index 0000000..7624956 --- /dev/null +++ b/NER_BERT/const.py @@ -0,0 +1,44 @@ +from torch.nn import CrossEntropyLoss + +pad_token_id = 0 + +ENTITY_OTHER = 'O' +ENTITY_BEGIN = 'B-' +ENTITY_CONT = 'I-' +ENTITY_SINGLE = 'S-' +ENTITY_END = 'E-' + +UNK_INTENT = 'unknown' + +label_pad_id = CrossEntropyLoss().ignore_index +sep_token = "[SEP]" +cls_token = "[CLS]" +cls_token_segment_id = 0 +sequence_a_segment_id = 0 +pad_token_segment_id = 0 + +MODEL_TYPE = 'bert-base-uncased' + +ENTITY_NAMES=[ +"Appeal_to_Authority", +"Appeal_to_fear-prejudice", +"Black-and-White_Fallacy", +"Causal_Oversimplification", +"Doubt", +"Exaggeration,Minimisation", +"Flag-Waving", +"Loaded_Language", +"Name_Calling,Labeling", +"Obfuscation,Intentional_Vagueness,Confusion", +"Repetition", +"Slogans", +"Thought-terminating_Cliches", +"Whataboutism,Straw_Men,Red_Herring", +"Bandwagon,Reductio_ad_hitlerum" +] + +_UNK = "" +_PAD = "" +_START_VOCAB = [_UNK, _PAD] +UNK_ID = 0 +PAD_ID = 1 \ No newline at end of file diff --git a/NER_BERT/crf.py b/NER_BERT/crf.py new file mode 100644 index 0000000..d5ebe09 --- /dev/null +++ b/NER_BERT/crf.py @@ -0,0 +1,182 @@ +from __future__ import unicode_literals, print_function, division + +import torch +import torch.nn as nn +from torch.nn.utils.rnn import pad_sequence +import numpy as np + +is_cuda = torch.cuda.is_available() + +class CRF_Loss(nn.Module): + def __init__(self, tagset_size, pad_token_id, tag_pad_id): + super(CRF_Loss, self).__init__() + self.start_tag = tagset_size + self.end_tag = tagset_size + 1 + self.num_tags = tagset_size + 2 + self.tag_pad_id = tag_pad_id + self.pad_token_id = pad_token_id + + self.transitions = nn.Parameter(torch.Tensor(self.num_tags, self.num_tags)) + nn.init.constant_(self.transitions, -np.log(self.num_tags)) + + self.transitions.data[self.end_tag, :] = -10000 + self.transitions.data[:, self.start_tag] = 
-10000 + + def get_log_p_z(self, emissions, mask): + seq_len = emissions.shape[1] + log_alpha = emissions[:, 0].clone() + log_alpha += self.transitions[self.start_tag, : self.start_tag].unsqueeze(0) + + for idx in range(1, seq_len): + broadcast_emissions = emissions[:, idx].unsqueeze(1) + broadcast_transitions = self.transitions[ : self.start_tag, : self.start_tag].unsqueeze(0) + broadcast_logprob = log_alpha.unsqueeze(2) + score = broadcast_logprob + broadcast_emissions + broadcast_transitions + + score = torch.logsumexp(score, 1) + log_alpha = score * mask[:, idx].unsqueeze(1) + log_alpha.squeeze(1) * (1.0 - mask[:, idx].unsqueeze(1)) + + log_alpha += self.transitions[: self.start_tag, self.end_tag].unsqueeze(0) + return torch.logsumexp(log_alpha.squeeze(1), 1) + + def get_log_p_Y_X(self, emissions, mask, orig_tags): + seq_len = emissions.shape[1] + tags = orig_tags.clone() + tags[tags < 0] = 0 + + llh = self.transitions[self.start_tag, tags[:, 0]].unsqueeze(1) + llh += emissions[:, 0, :].gather(1, tags[:, 0].view(-1, 1)) * mask[:, 0].unsqueeze(1) + + for idx in range(1, seq_len): + old_state, new_state = ( + tags[:, idx - 1].view(-1, 1), + tags[:, idx].view(-1, 1), + ) + emission_scores = emissions[:, idx, :].gather(1, new_state) + transition_scores = self.transitions[old_state, new_state] + llh += (emission_scores + transition_scores) * mask[:, idx].unsqueeze(1) + + last_tag_indices = mask.sum(1, dtype=torch.long) - 1 + last_tags = tags.gather(1, last_tag_indices.view(-1, 1)) + + llh += self.transitions[last_tags.squeeze(1), self.end_tag].unsqueeze(1) + + return llh.squeeze(1) + + def log_likelihood(self, emissions, tags, mask): + log_z = self.get_log_p_z(emissions, mask) + log_p_y_x = self.get_log_p_Y_X(emissions, mask, tags) + return log_p_y_x - log_z + + def get_crf_loss(self, logits, y): + mask = y.ne(self.tag_pad_id) + s_lens = mask.sum(1) + loss = -1 * self.log_likelihood(logits, y, mask.float()) + loss = loss / s_lens.float() + loss = loss.mean() + return loss + + def viterbi_decode(self, emissions, mask): + mask = mask.float() + b, seq_len, d = emissions.shape + log_prob = emissions[:, 0].clone() + log_prob += self.transitions[self.start_tag, : self.start_tag].unsqueeze(0) + + end_scores = log_prob + self.transitions[: self.start_tag, self.end_tag].unsqueeze(0) + + best_scores_list = [] + best_scores_list.append(end_scores.unsqueeze(1)) + + best_paths_0 = torch.Tensor().long() + if is_cuda: + best_paths_0 = best_paths_0.cuda() + best_paths_list = [best_paths_0] + + for idx in range(1, seq_len): + broadcast_emissions = emissions[:, idx].unsqueeze(1) + broadcast_transmissions = self.transitions[: self.start_tag, : self.start_tag].unsqueeze(0) + broadcast_log_prob = log_prob.unsqueeze(2) + score = broadcast_emissions + broadcast_transmissions + broadcast_log_prob + max_scores, max_score_indices = torch.max(score, 1) + best_paths_list.append(max_score_indices.unsqueeze(1)) + end_scores = max_scores + self.transitions[: self.start_tag, self.end_tag].unsqueeze(0) + + best_scores_list.append(end_scores.unsqueeze(1)) + log_prob = max_scores + + best_scores = torch.cat(best_scores_list, 1).float() + best_paths = torch.cat(best_paths_list, 1) + + max_scores, max_indices_from_scores = torch.max(best_scores, 2) + + valid_index_tensor = torch.tensor(0).long() + padding_tensor = torch.tensor(self.tag_pad_id).long() + + if is_cuda: + valid_index_tensor = valid_index_tensor.cuda() + padding_tensor = padding_tensor.cuda() + #alternative to where + #curr_mask = mask[:, seq_len - 1].float() + 
#labels = max_indices_from_scores[:, seq_len - 1] * curr_mask + torch.logical_not(curr_mask) * padding_tensor + + labels = max_indices_from_scores[:, seq_len - 1] + labels = torch.where(mask[:, seq_len - 1] != 1.0, padding_tensor, labels) + all_labels = labels.unsqueeze(1).long() + ##### + labels_score = max_scores[:, seq_len - 1] + all_labels_score = labels_score.unsqueeze(1) + #### + for idx in range(seq_len - 2, -1, -1): + indices_for_lookup = all_labels[:, -1].clone() + indices_for_lookup = torch.where(indices_for_lookup == self.tag_pad_id, + valid_index_tensor, + indices_for_lookup) + + indices_from_prev_pos = best_paths[:, idx, :].gather(1, indices_for_lookup.view(-1, 1).long()).squeeze(1) + indices_from_prev_pos = torch.where(mask[:, idx + 1] != 1.0, padding_tensor, indices_from_prev_pos) + + indices_from_max_scores = max_indices_from_scores[:, idx] + indices_from_max_scores = torch.where(mask[:, idx + 1] == 1.0, padding_tensor, indices_from_max_scores) + + labels = torch.where(indices_from_max_scores == self.tag_pad_id, + indices_from_prev_pos, + indices_from_max_scores) + # Set to ignore_index if present state is not valid. + labels = torch.where(mask[:, idx] != 1.0, padding_tensor, labels) + all_labels = torch.cat((all_labels, labels.view(-1, 1).long()), 1) + ###### + labels_score = max_scores[:, idx] + all_labels_score = torch.cat((all_labels_score, labels_score.view(-1, 1)), 1) + #### + #think about squeezing this score between 0 and 1 + last_tag_indices = mask.sum(1, dtype=torch.long) - 1 + sentence_score = max_scores.gather(1, last_tag_indices.view(-1, 1)).squeeze(1) + all_labels = torch.flip(all_labels, [1]) + all_labels_score = torch.flip(all_labels_score, [1]) + + return sentence_score, all_labels, all_labels_score + + def structural_perceptron_loss(self, emissions, tags): + mask = tags.ne(self.tag_pad_id).float() + + best_scores, pred = self.viterbi_decode(emissions, mask, is_cuda) + log_p_y_x = self.get_log_p_Y_X(emissions, mask, tags) + + delta = torch.sum(tags.ne(pred).float()*mask, 1) + + margin_loss = torch.clamp(best_scores + delta - log_p_y_x, min=0.0) + return margin_loss + + def bert_output2crf_input(self, logits_ner, labels): + mask = labels.ne(self.tag_pad_id) + lens = mask.sum(1).view(-1).tolist() + + logits_selected = torch.masked_select(logits_ner, mask.unsqueeze(2)).view(-1, logits_ner.size()[-1]) + logits_split = torch.split(logits_selected, lens) + logits_padded = pad_sequence(logits_split, batch_first=True, padding_value=self.pad_token_id) + + labels_selected = torch.masked_select(labels, mask) + labels_split = torch.split(labels_selected, lens) + labels_padded = pad_sequence(labels_split, batch_first=True, padding_value=self.tag_pad_id) + + return logits_padded, labels_padded diff --git a/NER_BERT/dataset_utils.py b/NER_BERT/dataset_utils.py new file mode 100644 index 0000000..fc45444 --- /dev/null +++ b/NER_BERT/dataset_utils.py @@ -0,0 +1,320 @@ +from __future__ import absolute_import, division, print_function +import glob +import os.path +import codecs +from pathlib import Path +import os +from collections import defaultdict +from sklearn.model_selection import train_test_split +import json +import re + +import const + +def tokenize(w): + q = w + contr_dict = {"’": "'", + "i\'m": "i am", + "won\'t": " will not", + "\'s": " s", + "\'ll": " will", + "\'ve": " have", + "n\'t": " not", + "\'re": " are", + "\'d": " would", + "y'all": " all of you"} + + for contr in contr_dict: + q = q.replace(contr, contr_dict[contr]) + + q_arr = 
re.findall(r"[\w]+[']*[\w]+|[\w]+|[.,!?;:]", q, re.UNICODE) + + q = ' '.join(q_arr) + q = re.sub('[0-9]{5,}', '#####', q) + q = re.sub('[0-9]{4}', '####', q) + q = re.sub('[0-9]{3}', '###', q) + q = re.sub('[0-9]{2}', '##', q) + + q = q.strip().lower().split() + if len(q) == 0: + return [w] + return q + +def tagid2tag_seq(tag_vocab, tagid_seq): + return [[tag_vocab[t] for t in tag_seq] for tag_seq in tagid_seq] + +def get_tokenize_tag(tagging_type, t, n): + if n == 1: + return [t] + if t == const.ENTITY_OTHER or t.startswith(const.ENTITY_CONT) or tagging_type == 'B': + return [t] * n + + name = t[2:] + start = t[:2] + tags = [] + if start == const.ENTITY_BEGIN: + tags.append(t) + for i in range(1, n): + tags.append(const.ENTITY_CONT + name) + return tags + + if tagging_type == "BIOES": + if start == const.ENTITY_SINGLE: + tags.append(t) + for i in range(1, n-1): + tags.append(const.ENTITY_CONT + name) + tags.append(const.ENTITY_END + name) + elif start == const.ENTITY_END: + for i in range(n-1): + tags.append(const.ENTITY_CONT + name) + tags.append(t) + + return tags + +def get_tag_vocab(config): + tags = [const.ENTITY_OTHER] + if config.tagging_type == 'B': + tags.append(const.ENTITY_BEGIN) + else: + tags.extend([const.ENTITY_BEGIN + t for t in const.ENTITY_NAMES]) + tags.extend([const.ENTITY_CONT + t for t in const.ENTITY_NAMES]) + + if config.tagging_type == "BIOES": + tags.extend([const.ENTITY_END + t for t in const.ENTITY_NAMES]) + tags.extend([const.ENTITY_SINGLE + t for t in const.ENTITY_NAMES]) + + return tags + +def read_articles_from_file_list(folder_name, file_pattern="*.txt"): + file_list = glob.glob(os.path.join(folder_name, file_pattern)) + articles = [] + for filename in sorted(file_list): + article_id = os.path.basename(filename).split(".")[0][7:] + with codecs.open(filename, "r", encoding="utf8") as f: + articles.append((article_id, f.read())) + return articles + + +def parse_label(label_path): + labels = [] + f = Path(label_path) + + if not f.exists(): + return labels + + for line in open(label_path): + parts = line.strip().split('\t') + labels.append({'start': int(parts[2]), 'end': int(parts[3]), 'type': parts[1]}) + + labels.sort(key=lambda s: (s['start'], -s['end'])) + return labels + +def clean_text(article): + sentences = article.split('\n') + end = -1 + res = [] + for sentence in sentences: + start = end + 1 + end = start + len(sentence) # length of sequence + if sentence != "": # if not empty line + res.append({'start': start, 'end': end, 'sentence': sentence}) + return res + +def get_overlapping_entities(entities): + etree = defaultdict(set) + # inefficeint but clean + for a in range(len(entities)): + ea = entities[a] + for b in range(a + 1, len(entities)): + eb = entities[b] + overlap_start = max(ea['start'], eb['start']) + overlap_end = min(ea['end'], eb['end']) + if overlap_start <= overlap_end: + # if eb['end'] > ea['end']: + # print('partial', ea, eb) + etree[a].add(b) + etree[b].add(a) + assert all([(a in etree) for k in etree for a in etree[k]]) + # assert all([len(etree[k]) == 1 for k in etree]) + return etree + +def get_per_sentence_entity(entities, ds, de): + d_entities = [] + for a in range(len(entities)): + ea = entities[a] + overlap_start = max(ea['start'], ds) + overlap_end = min(ea['end'], de) + if overlap_start <= overlap_end: + d_entities.append({ + 'type': ea['type'], + 'start': overlap_start - ds, + 'end': overlap_end - ds + }) + d_entities.sort(key=lambda s: (s['start'], -s['end'])) + return d_entities + + +def get_non_overlapping_seq(etree, 
n): + # get non overlapping entity sequence + ex = set() + if len(etree) > 0: + for k in etree: + ex.add(tuple([i for i in range(n) if i not in etree[k]])) + else: + ex.add(tuple(range(n))) + return ex + + +def get_tag_seq(n, name, tagging_type): + tags = [] + for i in range(n): + tag = None + if tagging_type == 'IOBES': + if i == 0: + if n == 1: + tag = const.ENTITY_SINGLE + else: + tag = const.ENTITY_BEGIN + elif i == n - 1: + tag = const.ENTITY_END + else: + tag = const.ENTITY_CONT + elif tagging_type == 'IOB': + if i == 0: + tag = const.ENTITY_BEGIN + else: + tag = const.ENTITY_CONT + elif tagging_type == 'B': + tag = const.ENTITY_BEGIN + else: + raise Exception('tagging_type no recognized') + assert tag is not None + if tagging_type == 'B': + tags.append(tag) + else: + tags.append(tag + name) + + return tags + +def encode_tokens_json(e, sentence, d_entities): + tokens = [] + labels = [] + curr = 0 + for a in sorted(e): + ea = d_entities[a] + pre = sentence[curr:ea['start']].split() + span = sentence[ea['start']:ea['end']].split() + + tokens.extend(pre) + start = len(tokens) + tokens.extend(span) + end = len(tokens) + labels.append({ 'type': ea['type'], + 'start': start, + 'end' : end}) + curr = ea['end'] + + pre = sentence[curr:].split() + tokens.extend(pre) + + return {'tokens': tokens, + 'labels': labels} + +def get_data_train_dev(root_dir, filename, config): + examples = [] + for line in codecs.open(os.path.join(root_dir, filename), "r", encoding="utf8"): + datum = json.loads(line) + orig_tokens = datum['tokens'] + assert len(orig_tokens) > 0, line + orig_tags = [const.ENTITY_OTHER]* len(datum['tokens']) + for e in datum['labels']: + orig_tags[e['start']:e['end']] = get_tag_seq(e['end'] - e['start'], e['type'], config.tagging_type) + + tokens = [] + tags = [] + for i, w in enumerate(orig_tokens): + w_i = tokenize(w) + t_i = orig_tags[i] + tokens.extend(w_i) + tokenized_tags = get_tokenize_tag(config.tagging_type, t_i, len(w_i)) + tags.extend(tokenized_tags) + + assert len(w_i) == len(tokenized_tags), f'{w_i} => {tokenized_tags}' + assert len(tokens) == len(tags) and len(tokens) > 0, f'{tokens} => {tags}' + examples.append((tokens, tags)) + return examples + + +def get_data_test(root_dir, filename): + examples = [] + metadata = [] + + for line in codecs.open(os.path.join(root_dir, filename), "r", encoding="utf8"): + datum = json.loads(line) + orig_tokens = datum['tokens'] + tokens = [] + ignore_mapping = [] + data_tokens = [] + for i, w in enumerate(orig_tokens): + w_i = tokenize(w) + tokens.extend(w_i) + ignore = [0]*len(w_i) + ignore[0] = 1 + ignore_mapping.extend(ignore) + data_tokens.extend([w]*len(w_i)) + + tags = [const.ENTITY_OTHER] * len(tokens) + examples.append((tokens, tags)) + metadata.append({'article_id': datum['article_id'], + 'start_sentence': datum['start_sentence'], + 'end_sentence': datum['end_sentence'], + 'ignore_mapping': ignore_mapping, + 'data_tokens': data_tokens + }) + return examples, metadata + + +def dump_data(root_dir): + train_data = read_articles_from_file_list(os.path.join(root_dir, 'train-articles')) + label_dir = os.path.join(root_dir, 'train-labels-task2-technique-classification') + + examples = [] + for article_id, line in train_data: + entities = parse_label(os.path.join(label_dir, f'article{article_id}.task2-TC.labels')) + + for d in clean_text(line): + d_entities = get_per_sentence_entity(entities, d['start'], d['end']) + etree = get_overlapping_entities(d_entities) + ex = get_non_overlapping_seq(etree, len(d_entities)) + + sentence = 
d['sentence'] + for e in ex: + datum = encode_tokens_json(e, sentence, d_entities) + datum['article_id'] = article_id + examples.append(json.dumps(datum)) + + train_data, dev_data = train_test_split(examples, test_size=0.2) + with codecs.open(os.path.join(root_dir, 'train.jsonl'), "w", encoding="utf8") as f: + f.write('\n'.join(train_data) + '\n') + with codecs.open(os.path.join(root_dir, 'dev.jsonl'), "w", encoding="utf8") as f: + f.write('\n'.join(dev_data) + '\n') + +def dump_data_test(root_dir): + examples = [] + dev_data = read_articles_from_file_list(os.path.join(root_dir, 'dev-articles')) + + for article_id, line in dev_data: + for d in clean_text(line): + datum = {'tokens':d['sentence'].split(), + 'start_sentence': d['start'], + 'end_sentence': d['end'], + 'article_id':article_id} + examples.append(json.dumps(datum)) + + with codecs.open(os.path.join(root_dir, 'test_phase0.jsonl'), "w", encoding="utf8") as f: + f.write('\n'.join(examples) + '\n') + +if __name__ == "__main__": + data_dir = os.path.join(os.path.expanduser("~"), 'prop/datasets') + dump_data(data_dir) + dump_data_test(data_dir) diff --git a/NER_BERT/decoder.py b/NER_BERT/decoder.py new file mode 100644 index 0000000..80c1545 --- /dev/null +++ b/NER_BERT/decoder.py @@ -0,0 +1,111 @@ +import torch +import numpy as np +import torch.nn.functional as F +from torch.nn.utils.rnn import pad_sequence + +#reference https://github.com/allenai/allennlp/blob/a7265c04078964ea2b80a78fc3967bde8d16072d/allennlp/nn/util.py#L403 + +@torch.jit.script +def viterbi_decode_single_jit(tag_sequence, transition_matrix): + top_k = 1 + sequence_length, num_tags = tag_sequence.size() + num_tags = num_tags + 2 + + zero_sentinel = torch.zeros(1, num_tags) + extra_tags_sentinel = torch.ones(sequence_length, 2) * -10000 + tag_sequence = torch.cat([tag_sequence, extra_tags_sentinel], -1) + tag_sequence = torch.cat([zero_sentinel, tag_sequence, zero_sentinel], 0) + + path_indices = torch.zeros(num_tags, dtype=torch.long, device=tag_sequence.device).unsqueeze(0) + path_scores = tag_sequence[0, :].unsqueeze(0) + + for t in range(1, tag_sequence.size(0)): + summed_potentials = path_scores.unsqueeze(2) + transition_matrix + summed_potentials = summed_potentials.view(-1, num_tags) + + scores, paths = torch.topk(summed_potentials, k=top_k, dim=0) + + path_scores = tag_sequence[t, :].unsqueeze(0) + scores + path_indices = torch.cat([path_indices, paths], 0) + + path_indices = path_indices[1:] + path_scores_v = path_scores.view(-1) + viterbi_scores, best_paths = torch.topk(path_scores_v, k=top_k, dim=0) + + n_paths_indices = path_indices.size(0) + + viterbi_paths = torch.zeros(sequence_length, dtype=torch.long, device=tag_sequence.device).unsqueeze(0) + tag_scores = torch.zeros(sequence_length, device=tag_sequence.device).unsqueeze(0) + for i in range(top_k): + viterbi_path = best_paths[0].unsqueeze(0) + + for k in range(n_paths_indices): + t_rev = n_paths_indices - k - 1 + backward_timestep = path_indices[t_rev, :] + tag_id = torch.index_select(backward_timestep.view(-1), 0, viterbi_path[-1]) + viterbi_path = torch.cat([viterbi_path, tag_id], -1) + + viterbi_path = viterbi_path.flip(0) + viterbi_path = viterbi_path % num_tags + viterbi_path = viterbi_path[1:-1] + viterbi_paths = torch.cat([viterbi_paths, viterbi_path.unsqueeze(0)], 0) + + tag_score = torch.gather(tag_sequence[1:-1], 1, viterbi_path.unsqueeze(-1)).view(-1) + tag_scores = torch.cat([tag_scores, tag_score.unsqueeze(0)], 0) + viterbi_paths = viterbi_paths[1:] + tag_scores = tag_scores[1:] + return 
viterbi_paths, tag_scores.exp(), viterbi_scores.exp() + +def predict_ner_single_jit(logits_ner, labels, transition_matrix, tag_pad_id): + mask = labels.ne(tag_pad_id) + logits_padded = torch.masked_select(logits_ner, mask.unsqueeze(2)).view(-1, logits_ner.size()[-1]) + return viterbi_decode_single_jit(logits_padded, transition_matrix) + + +####### +def viterbi_decode_single_python(e, t): + num_tags = len(e[0]) + seq_len = len(e) + start_tag_id = num_tags + end_tag_id = num_tags + 1 + + dp_links = [] + dp = [0.] * num_tags + curr_dp_links = [] + for j in range(num_tags): + dp[j] = t[start_tag_id, j] + e[0][j] + curr_dp_links.append(-1) + dp_links.append(curr_dp_links) + + for i in range(1, seq_len): + new_dp = [] + curr_dp_links = [] + for j in range(num_tags): + all_candidates = [np.logaddexp(t[k, j] + e[i][j], dp[k]) for k in range(num_tags)] + max_k = max(range(num_tags), key=lambda i: all_candidates[i]) + new_dp.append(all_candidates[max_k]) + curr_dp_links.append(max_k) + dp = new_dp + dp_links.append(curr_dp_links) + + all_candidates = [np.logaddexp(t[k, end_tag_id], dp[k]) for k in range(num_tags)] + max_k = max(range(num_tags), key=lambda i: all_candidates[i]) + sentence_score = all_candidates[max_k] + + all_labels = [max_k] + all_labels_score = [t[max_k, end_tag_id]] + + for i in range(seq_len - 1, 0, -1): + curr_k = dp_links[i][max_k] + all_labels.append(curr_k) + all_labels_score.append(t[curr_k, max_k] + e[i][max_k]) + max_k = curr_k + + return sentence_score, all_labels[::-1], all_labels_score[::-1] + +def predict_ner_single_python(logits_ner, labels, transitions, tag_pad_id): + mask = labels.ne(tag_pad_id) + logits_ner_padded = torch.masked_select(logits_ner, mask.unsqueeze(2)).view(-1, logits_ner.size()[-1]) + logits_ner_padded_lsf = F.log_softmax(logits_ner_padded, dim=-1) + score_sentence, pred_tags, score_tags = viterbi_decode_single_python(logits_ner_padded_lsf.cpu().data.numpy(), transitions.cpu().data.numpy()) + return [pred_tags], [np.exp(score_tags)], [np.exp(score_sentence)] diff --git a/NER_BERT/eval_util.py b/NER_BERT/eval_util.py new file mode 100644 index 0000000..d3e8a0f --- /dev/null +++ b/NER_BERT/eval_util.py @@ -0,0 +1,74 @@ +from __future__ import absolute_import, division, print_function + +from seqeval.metrics import precision_score, recall_score, f1_score + +import const +import os +import dataset_utils + +# eval +def get_chunks(seq, ignore_I_mismatch=False): + chunks = [] + chunk_type, chunk_start = None, None + for i, tok in enumerate(seq): + if tok == const.ENTITY_OTHER: + if chunk_type is not None: + chunks.append((chunk_type, chunk_start, i)) + chunk_type, chunk_start = None, None + else: + curr_chunk_type = tok[2:] + chunk_prefix = tok[:2] + if chunk_prefix == const.ENTITY_BEGIN: + if chunk_type is not None: + chunks.append((chunk_type, chunk_start, i)) + + chunk_type, chunk_start = curr_chunk_type, i + elif chunk_prefix == const.ENTITY_CONT and not ignore_I_mismatch: + if chunk_type is not None and chunk_type != curr_chunk_type: + chunks.append((chunk_type, chunk_start, i)) + chunk_type, chunk_start = None, None + # end condition + if chunk_type is not None: + chunks.append((chunk_type, chunk_start, len(seq))) + + chunks = list(set(chunks)) + chunks.sort(key=lambda s: s[0]) + return chunks + +def evaluate(gold_label_list, preds_list, config): + tag_vocab = dataset_utils.get_tag_vocab(config) + gold_label_list = dataset_utils.tagid2tag_seq(tag_vocab, gold_label_list) + preds_list = dataset_utils.tagid2tag_seq(tag_vocab, preds_list) + + results = { 
+ "precision": precision_score(gold_label_list, preds_list), + "recall": recall_score(gold_label_list, preds_list), + "f1": f1_score(gold_label_list, preds_list) + } + return results + + +def dump_result(preds_list, metadata, test_data, root_dir, filename, config): + tag_vocab = dataset_utils.get_tag_vocab(config) + preds_list = dataset_utils.tagid2tag_seq(tag_vocab, preds_list) + + tc = os.path.join(root_dir, 'tc_' + filename) + si = os.path.join(root_dir, 'si_' + filename) + + with open(tc, "w") as tc_writer, open(si, "w") as si_writer: + for i, t in enumerate(preds_list): + article = metadata[i] + sen_start = article['start_sentence'] + article_id = article['article_id'] + ignore_mapping = article['ignore_mapping'] + data_tokens = article['data_tokens'] + + for type, start, end in get_chunks(t): + #adjust start end + orig_tokens = [data_tokens[i] for i in range(start) if ignore_mapping[i] == 1] + start_boundary = len(' '.join(orig_tokens)) + sen_start + orig_tokens = [data_tokens[i] for i in range(start, end) if ignore_mapping[i] == 1] + end_boundary = start_boundary + len(' '.join(orig_tokens)) + si_writer.write(f'{article_id}\t{start_boundary}\t{end_boundary}\n') + tc_writer.write(f'{article_id}\t{type}\t{start_boundary}\t{end_boundary}\n') + diff --git a/NER_BERT/lstm_data_util.py b/NER_BERT/lstm_data_util.py new file mode 100644 index 0000000..c460b8b --- /dev/null +++ b/NER_BERT/lstm_data_util.py @@ -0,0 +1,80 @@ +import torch +import torch.nn as nn +import const +import dataset_utils + +def get_data(examples, vocab, config): + tag_vocab = dataset_utils.get_tag_vocab(config) + tag2id = {t: i for i, t in enumerate(tag_vocab)} + + all_data = [] + for orig_tokens, orig_tag in examples: + input_ids, char_ids, label_ids = prepare_input(orig_tokens, orig_tag, tag2id, vocab, config) + all_data.append((input_ids, char_ids, label_ids)) + return all_data + +def create_char_batch(char_id_seq): + batch_char_ids = [] + max_len = max([len(char_ids) for char_ids in char_id_seq]) + for char_ids in char_id_seq: + pad_len = max_len - len(char_ids) + padding_op = nn.ConstantPad1d((0, pad_len), const.pad_token_id) + batch_char_ids.append(padding_op(char_ids).unsqueeze(0)) + + batch_char_ids = torch.cat(batch_char_ids) + mask = batch_char_ids.ne(const.pad_token_id) + + return batch_char_ids, mask + +def prepare_input(orig_tokens, orig_tag, tag2id, vocab, config): + input_ids = [] + label_ids = [] + char_id_seq = [] + + assert len(orig_tag) == len(orig_tokens), orig_tag + orig_tokens + + for i, w in enumerate(orig_tokens): + label_id = tag2id[orig_tag[i]] + w_id = vocab['word2id'][w] if w in vocab['word2id'] else const.UNK_ID + input_ids.append(w_id) + label_ids.append(label_id) + w_char_ids = [vocab['char2id'][c] if c in vocab['char2id'] else const.UNK_ID for c in w] + w_char_ids = torch.tensor(w_char_ids).long() + + char_id_seq.append(w_char_ids) + + char_ids = create_char_batch(char_id_seq) + input_ids = torch.tensor(input_ids).long() + label_ids = torch.tensor(label_ids).long() + return input_ids, char_ids, label_ids + +def create_batch(train_data, batch_ids, is_cuda): + max_len = max([len(train_data[bi][0]) for bi in batch_ids]) + batch_input_ids = [] + batch_char_ids = [] + batch_label_ids = [] + + for bi in batch_ids: + input_ids, char_ids, label_ids = train_data[bi] + batch_char_ids.append(char_ids) + pad_len = max_len - len(input_ids) + padding_op = nn.ConstantPad1d((0, pad_len), const.pad_token_id) + batch_input_ids.append(padding_op(input_ids).unsqueeze(0)) + padding_op = nn.ConstantPad1d((0, 
pad_len), const.label_pad_id) + batch_label_ids.append(padding_op(label_ids).unsqueeze(0)) + + batch_input_ids = torch.cat(batch_input_ids) + batch_label_ids = torch.cat(batch_label_ids) + + att_mask = batch_input_ids.ne(const.pad_token_id) + + if is_cuda: + batch_input_ids = batch_input_ids.cuda() + batch_label_ids = batch_label_ids.cuda() + att_mask = att_mask.cuda() + + inputs = {'word_ids': batch_input_ids, + 'mask': att_mask, + 'char_ids':batch_char_ids, + 'labels': batch_label_ids} + return inputs diff --git a/NER_BERT/model_bert_train.py b/NER_BERT/model_bert_train.py new file mode 100644 index 0000000..12006d3 --- /dev/null +++ b/NER_BERT/model_bert_train.py @@ -0,0 +1,280 @@ +from __future__ import absolute_import, division, print_function + +import os.path + +import os +import torch +import torch.nn as nn +import numpy as np +import random +import time +import json + +import const +import eval_util +import bert_data_util +import dataset_utils + +from torch.nn import CrossEntropyLoss +from transformers import BertModel, BertConfig +from transformers import AdamW, get_linear_schedule_with_warmup + +is_cuda = torch.cuda.is_available() + +class Config(object): + def __init__(self): + self.num_epoch = 10 + self.weight_decay = 0.0 # 1e-8 + self.batch_size = 8 + self.eval_batch_size = 8 + self.max_grad_norm = 1.0 + self.learning_rate = 5e-5 + self.adam_epsilon = 1e-8 + self.print_interval = 1000 * self.batch_size + self.warmup_steps = 0.0 + self.max_seq_length = 100 # =32 - 2 + self.seed = 42 + self.tagging_type = 'B' + +config = Config() + +#####set seed +random.seed(config.seed) +np.random.seed(config.seed) +torch.manual_seed(config.seed) +if is_cuda > 0: + torch.cuda.manual_seed_all(config.seed) +#####set seed end + +class MyBertForTokenClassification(nn.Module): + def __init__(self, num_tags): + super(MyBertForTokenClassification, self).__init__() + self.bert = BertModel.from_pretrained(const.MODEL_TYPE) + + self.config = BertConfig.from_pretrained(const.MODEL_TYPE) + self.config.num_tags = num_tags + + self.dropout_ner = nn.Dropout(self.config.hidden_dropout_prob) + self.classifier_ner = nn.Linear(self.config.hidden_size, num_tags) + + self.classifier_ner.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + self.classifier_ner.bias.data.zero_() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None + ): + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) + # NER + sequence_output = outputs[0] + sequence_output = self.dropout_ner(sequence_output) + logits_ner = self.classifier_ner(sequence_output) + + outputs = (logits_ner,) + outputs[2:] # add hidden states and attention if they are here + return outputs # (loss), scores_ner, (hidden_states), (attentions) + + def save_pretrained(self, save_directory): + self.config.save_pretrained(save_directory) + output_model_file = os.path.join(save_directory, "pytorch_model.bin") + torch.save(self.state_dict(), output_model_file) + + def get_loss_greedy(self, logits_ner, labels, attention_mask): + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + active_loss = attention_mask.view(-1) == 1 + active_logits = logits_ner.view(-1, self.model_config.num_tags)[active_loss] + active_labels = labels.view(-1)[active_loss] + loss = loss_fct(active_logits, active_labels) + return 
loss + + def get_loss_crf(self, logits_ner, labels, attention_mask): + logits_padded, labels_padded = self.crf.bert_output2crf_input(logits_ner, labels) + loss = self.crf.get_crf_loss(logits_padded, labels_padded) + return loss + + def predict_ner_greedy(self, logits_ner, labels): + logits_ner = logits_ner.softmax(dim=2) + score_tags, pred_tags = logits_ner.max(dim=2) + return -1.0, pred_tags, score_tags, labels + + def predict_ner_viterbi(self, logits_ner, labels): + logits_padded, labels_padded = self.crf.bert_output2crf_input(logits_ner, labels) + mask = labels_padded.ne(self.crf.tag_pad_id) + score_sentence, pred_tags, score_tags = self.crf.viterbi_decode(logits_padded, mask) + return score_sentence, pred_tags, score_tags, labels_padded + +def get_optimizer(model, config, t_total): + # Prepare optimizer and schedule (linear warmup and decay) + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": config.weight_decay}, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0} + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, + num_training_steps=t_total) + + return optimizer, scheduler + + +def predictions(dev_data, model, config): + data_size = len(dev_data) + ids = np.arange(data_size) + eval_loss = 0 + model.eval() + + gold_label_list = [] + preds_list = [] + + for i in range(0, data_size, config.eval_batch_size): + batch_ids = ids[i:i + config.eval_batch_size] + + inputs = bert_data_util.create_batch(dev_data, batch_ids, is_cuda) + outputs_t = model(**inputs) + loss_t, scores_ner_t = outputs_t[:2] + + eval_loss += loss_t.item() + max_value_tag_t, pred_tag_t = torch.max(scores_ner_t, dim=2) + + pred_tag = pred_tag_t.cpu().data.numpy() + gold_tag = inputs['labels'].cpu().data.numpy() + + for k, bi in enumerate(batch_ids): + s_len = len(dev_data[bi][0]) + predict_list = [] + gold_list = [] + for j in range(s_len): + if gold_tag[k][j] == const.label_pad_id: + continue + predict_list.append(pred_tag[k][j]) + gold_list.append(gold_tag[k][j]) + + gold_label_list.append(gold_list) + preds_list.append(predict_list) + eval_loss /= data_size + return preds_list, gold_label_list, eval_loss + +def train(output_root_dir, train_data_seq, dev_data_seq): + tag_vocab = dataset_utils.get_tag_vocab(config) + tag2id = {t: i for i, t in enumerate(tag_vocab)} + + train_data = bert_data_util.get_bert_data(train_data_seq, tag2id, config) + dev_data = bert_data_util.get_bert_data(dev_data_seq, tag2id, config) + + model = MyBertForTokenClassification(num_tags=len(tag2id)) + if is_cuda: + model = model.cuda() + + data_size = len(train_data) + num_batch = np.ceil(data_size / config.batch_size) + t_total = config.num_epoch * num_batch + + optimizer, scheduler = get_optimizer(model, config, t_total) + + exp_loss = None + global_step = 0 + best_dev_f1 = 0 + model.zero_grad() + ids = np.arange(data_size) + for epoch in range(config.num_epoch): + np.random.shuffle(ids) + for i in range(0, data_size, config.batch_size): + batch_ids = ids[i:i + config.batch_size] + + model.train() + inputs = bert_data_util.create_batch(train_data, batch_ids, is_cuda) + outputs = model(**inputs) + loss = outputs[0] + + loss.backward() + nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm) + 
optimizer.step() + scheduler.step() + model.zero_grad() + global_step += 1 + + exp_loss = 0.99 * exp_loss + 0.01 * loss.item() if exp_loss else loss.item() + + if global_step > 0 and global_step % config.print_interval == 0: + print(f'{global_step} / {t_total} train loss: {exp_loss} lr: {scheduler.get_lr()[0]}', flush=True) + + preds_list, gold_label_list, eval_loss = predictions(dev_data, model, config) + results = eval_util.evaluate(preds_list, gold_label_list, tag_vocab) + print(f'{global_step}/{t_total} NER: p/r/f1 {results["precision"]:.5f}/{results["recall"]:.5f}/{results["f1"]:.5f}', flush=True) + + f1 = results['f1'] + if f1 > best_dev_f1: + # output_dir = os.path.join(output_root_dir, 'checkpoint-{}'.format(epoch)) + output_dir = os.path.join(output_root_dir, 'checkpoint') + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + print(f"Saving model checkpoint to {output_dir}", flush=True) + + model.save_pretrained(output_dir) + bert_data_util.bert_tokenizer.save_pretrained(output_dir) + + with open(os.path.join(output_dir, "training_config.json"), 'w') as fout: + json.dump(vars(config), fout) + + print(f'{global_step} / {t_total} train loss: {exp_loss} lr: {scheduler.get_lr()[0]}', flush=True) + +def get_model(model_dir, num_tag): + model = MyBertForTokenClassification(num_tags=num_tag) + + #load model + save_directory = os.path.join(root_dir, model_dir + '/checkpoint') + model_file_path = os.path.join(save_directory, "pytorch_model.bin") + print(f'reading model from {model_file_path}') + state_dict = torch.load(model_file_path, map_location=lambda storage, location: storage) + model.eval() + model.load_state_dict(state_dict, strict=False) + + if is_cuda: + model = model.cuda() + return model + +def process_train(root_dir, data_dir): + output_root_dir = os.path.join(root_dir, f'dl_model_{int(time.time())}') + if not os.path.exists(output_root_dir): + os.makedirs(output_root_dir) + print(f'model out dir {output_root_dir}', flush=True) + + train_data_seq = dataset_utils.get_data_train_dev(data_dir, 'train.jsonl', config) + dev_data_seq = dataset_utils.get_data_train_dev(data_dir, 'dev.jsonl', config) + + train(output_root_dir, train_data_seq, dev_data_seq) + +def process_eval(root_dir, model_dir, data_dir, filename): + tag_vocab = dataset_utils.get_tag_vocab(config) + tag2id = {t: i for i, t in enumerate(tag_vocab)} + + model = get_model(model_dir, len(tag2id)) + + test_data, metadata = dataset_utils.get_data_test(data_dir, filename) + test_data_bert = bert_data_util.get_bert_data(test_data, tag2id, config) + + preds_list, _, _ = predictions(test_data_bert, model, config) + eval_util.dump_result(preds_list, metadata, tag_vocab, test_data, root_dir, 'boundary_model.txt') + +if __name__ == "__main__": + root_dir = os.path.join(os.path.expanduser("~"), 'private-projects/propganda/exp') + data_dir = os.path.join(os.path.expanduser("~"), 'private-projects/propganda/datasets') + #process_train(root_dir, data_dir) + model_dir = os.path.join(os.path.expanduser("~"), 'private-projects/propganda/exp/dl_model_1579925959') + process_eval(root_dir, model_dir, data_dir, 'test_phase0.jsonl') + diff --git a/NER_BERT/model_lstm.py b/NER_BERT/model_lstm.py new file mode 100644 index 0000000..179dbc2 --- /dev/null +++ b/NER_BERT/model_lstm.py @@ -0,0 +1,155 @@ +from __future__ import unicode_literals, print_function, division + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence +from torch.nn import 
Conv1d, ReLU + +from crf import CRF_Loss +import model_utils + +class CHAR_CONV(torch.nn.Module): + def __init__(self, + embedding_dim, + num_filters, + ngram_filter_sizes=(2, 3, 4, 5)): + super(CHAR_CONV, self).__init__() + + self.convolution_layers = torch.nn.ModuleList() + for ngram_size in ngram_filter_sizes: + conv_maxpool = torch.nn.ModuleList() + conv_maxpool.extend([Conv1d( + in_channels=embedding_dim, + out_channels=num_filters, + kernel_size=ngram_size), + ReLU(), + torch.nn.MaxPool1d(kernel_size=num_filters)]) + + self.convolution_layers.append(conv_maxpool) + + def forward(self, tokens, mask): + tokens = tokens * mask.unsqueeze(-1).float() + tokens = torch.transpose(tokens, 1, 2) + + filter_outputs = [] + for conv_maxpool in self.convolution_layers: + filter_outputs.append(conv_maxpool(tokens)) + + maxpool_output = torch.cat(filter_outputs, dim=1) if len(filter_outputs) > 1 else filter_outputs[0] + + return maxpool_output + +class NER_SOFTMAX_CHAR(nn.Module): + def __init__(self, emb_matrix, config, num_tags): + super(NER_SOFTMAX_CHAR, self).__init__() + + embd_vector = torch.from_numpy(emb_matrix['word']).float() + self.word_embeds = nn.Embedding.from_pretrained(embd_vector, freeze=False) + + embd_vector = torch.from_numpy(emb_matrix['char']).float() + self.char_embeds = nn.Embedding.from_pretrained(embd_vector, freeze=False) + + self.lstm_char = nn.LSTM(self.char_embeds.embedding_dim, + config.char_lstm_dim, + num_layers=1, bidirectional=True, batch_first=True) + + input_size = self.word_embeds.embedding_dim + config.char_lstm_dim * 2 + + self.lstm = nn.LSTM(input_size, + config.word_lstm_dim, + num_layers=1, bidirectional=True, batch_first=True) + + self.dropout = nn.Dropout(config.dropout_rate) + self.hidden_layer = nn.Linear(config.word_lstm_dim * 2, config.word_lstm_dim) + self.tanh_layer = torch.nn.Tanh() + + self.hidden2tag = nn.Linear(config.word_lstm_dim, num_tags) + + self.config = config + + model_utils.init_lstm_wt(self.lstm_char) + model_utils.init_lstm_wt(self.lstm) + model_utils.init_linear_wt(self.hidden_layer) + model_utils.init_linear_wt(self.hidden2tag) + + def forward(self, word_ids, mask, char_ids): + lengths = mask.sum(1, dtype=torch.long) + + max_length = torch.max(lengths) + char_emb = [] + word_embed = self.word_embeds(word_ids) + for chars, char_mask in char_ids: + char_len = char_mask.sum(1, dtype=torch.long) + seq_embed = self.char_embeds(chars) + seq_lengths, sort_idx = torch.sort(char_len, descending=True) + _, unsort_idx = torch.sort(sort_idx) + seq_embed = seq_embed[sort_idx] + packed = pack_padded_sequence(seq_embed, seq_lengths, batch_first=True) + output, hidden = self.lstm_char(packed) + lstm_feats, _ = pad_packed_sequence(output, batch_first=True) + lstm_feats = lstm_feats.contiguous() + b, t_k, d = list(lstm_feats.size()) + + seq_rep = lstm_feats.view(b, t_k, 2, -1) #0 is fwd and 1 is bwd + + last_idx = char_len - 1 + seq_rep_fwd = seq_rep[unsort_idx, 0, 0] + seq_rep_bwd = seq_rep[unsort_idx, last_idx, 1] + + seq_out = torch.cat([seq_rep_fwd, seq_rep_bwd], 1) + # fill up the dummy char embedding for padding + seq_out = F.pad(seq_out, (0, 0, 0, max_length - seq_out.size(0))) + char_emb.append(seq_out.unsqueeze(0)) + + char_emb = torch.cat(char_emb, 0) #b x n x c_dim + + word_embed = torch.cat([char_emb, word_embed], 2) + word_embed = self.dropout(word_embed) + + lengths = lengths.view(-1).tolist() + packed = pack_padded_sequence(word_embed, lengths, batch_first=True, enforce_sorted=False) + output, hidden = self.lstm(packed) + + lstm_feats, 
_ = pad_packed_sequence(output, batch_first=True) # h dim = B x t_k x n + lstm_feats = lstm_feats.contiguous() + + b, t_k, d = list(lstm_feats.size()) + + h = self.hidden_layer(lstm_feats.view(-1, d)) + h = self.tanh_layer(h) + logits = self.hidden2tag(h) + logits = logits.view(b, t_k, -1) + + return logits + + +class NER_SOFTMAX_CHAR_CRF(nn.Module): + def __init__(self, emb_matrix, config, tag_pad_id, num_tags): + super(NER_SOFTMAX_CHAR_CRF, self).__init__() + self.featurizer = NER_SOFTMAX_CHAR(emb_matrix, config, num_tags) + self.crf = CRF_Loss(num_tags, config, tag_pad_id) + self.config = config + + def forward(self, word_ids, mask, char_ids, labels=None): + output = self.featurizer(word_ids, mask, char_ids) + if labels is not None: + loss = self.get_loss(output, labels, mask) + output = (loss, output) + return output + + def get_loss(self, logits, y, mask): + if self.config.is_structural_perceptron_loss: + loss = self.crf.structural_perceptron_loss(logits, y) + else: + loss = -1 * self.crf.log_likelihood(logits, y) + + s_lens = mask.sum(1, dtype=torch.long) + + loss = loss / s_lens.float() + loss = loss.mean() + return loss + + def predict(self, emissions, mask): + best_scores, pred = self.crf.viterbi_decode_batch(emissions, mask) + return best_scores, pred diff --git a/NER_BERT/model_lstm_train.py b/NER_BERT/model_lstm_train.py new file mode 100644 index 0000000..6d36b1c --- /dev/null +++ b/NER_BERT/model_lstm_train.py @@ -0,0 +1,258 @@ +from __future__ import absolute_import, division, print_function + +import os.path + +import os +import torch +import torch.nn as nn +import numpy as np +import random +import time +import json +import codecs + +import const +import eval_util +import bert_data_util +import dataset_utils +import model_lstm +import model_utils +import lstm_data_util + +from torch.optim import Adam +from torch.optim.lr_scheduler import LambdaLR + +is_cuda = torch.cuda.is_available() + +class Config(object): + def __init__(self): + self.num_epoch = 10 + self.weight_decay = 0.0 # 1e-8 + self.batch_size = 8 + self.eval_batch_size = 8 + self.max_grad_norm = 1.0 + self.learning_rate = 5e-5 + self.adam_epsilon = 1e-8 + self.print_interval = 1000 * self.batch_size + self.warmup_steps = 0.0 + self.max_seq_length = 100 # =32 - 2 + self.seed = 42 + self.tagging_type = 'B' + self.word_emdb_dim = 100 + self.word_lstm_dim = 100 + self.char_embd_dim = 30 + self.char_lstm_dim = 64 + self.dropout_rate = 0.15 + self.is_structural_perceptron_loss = False + +config = Config() + +#####set seed +random.seed(config.seed) +np.random.seed(config.seed) +torch.manual_seed(config.seed) +if is_cuda > 0: + torch.cuda.manual_seed_all(config.seed) +#####set seed end +def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): + """ Create a schedule with a learning rate that decreases linearly after + linearly increasing during a warmup period. 
+ """ + + def lr_lambda(current_step): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + return max( + 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)) + ) + + return LambdaLR(optimizer, lr_lambda, last_epoch) + +def get_optimizer(model, config, t_total): + optimizer = Adam(model.parameters(), amsgrad=True) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, + num_training_steps=t_total) + return optimizer, scheduler + + +def predictions(dev_data, model, config): + data_size = len(dev_data) + ids = np.arange(data_size) + eval_loss = 0 + model.eval() + + gold_label_list = [] + preds_list = [] + + for i in range(0, data_size, config.eval_batch_size): + batch_ids = ids[i:i + config.eval_batch_size] + + inputs = lstm_data_util.create_batch(dev_data, batch_ids, is_cuda) + outputs_t = model(**inputs) + loss_t, scores_ner_t = outputs_t[:2] + best_scores, pred_tag_t = model.predict(scores_ner_t, inputs['mask']) + + eval_loss += loss_t.item() + + pred_tag = pred_tag_t.cpu().data.numpy() + gold_tag = inputs['labels'].cpu().data.numpy() + + for k, bi in enumerate(batch_ids): + s_len = len(dev_data[bi][0]) + predict_list = [] + gold_list = [] + for j in range(s_len): + if gold_tag[k][j] == const.label_pad_id: + continue + predict_list.append(pred_tag[k][j]) + gold_list.append(gold_tag[k][j]) + + gold_label_list.append(gold_list) + preds_list.append(predict_list) + eval_loss /= data_size + return preds_list, gold_label_list, eval_loss + +def train(output_root_dir, embd, train_data, dev_data): + num_tags = len(dataset_utils.get_tag_vocab(config)) + model = model_lstm.NER_SOFTMAX_CHAR_CRF(embd, config, const.label_pad_id, num_tags) + + if is_cuda: + model = model.cuda() + + data_size = len(train_data) + num_batch = np.ceil(data_size / config.batch_size) + t_total = config.num_epoch * num_batch + + optimizer, scheduler = get_optimizer(model, config, t_total) + + exp_loss = None + global_step = 0 + best_dev_f1 = 0 + model.zero_grad() + ids = np.arange(data_size) + for epoch in range(config.num_epoch): + np.random.shuffle(ids) + for i in range(0, data_size, config.batch_size): + batch_ids = ids[i:i + config.batch_size] + + model.train() + inputs = lstm_data_util.create_batch(train_data, batch_ids, is_cuda) + outputs = model(**inputs) + loss = outputs[0] + + loss.backward() + nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm) + optimizer.step() + scheduler.step() + model.zero_grad() + global_step += 1 + + exp_loss = 0.99 * exp_loss + 0.01 * loss.item() if exp_loss else loss.item() + + if global_step > 0 and global_step % config.print_interval == 0: + print(f'{global_step} / {t_total} train loss: {exp_loss} lr: {scheduler.get_lr()[0]}', flush=True) + + preds_list, gold_label_list, eval_loss = predictions(dev_data, model, config) + results = eval_util.evaluate(preds_list, gold_label_list, config) + print(f'{global_step}/{t_total} NER: p/r/f1 {results["precision"]:.5f}/{results["recall"]:.5f}/{results["f1"]:.5f}', flush=True) + + f1 = results['f1'] + if f1 > best_dev_f1: + # output_dir = os.path.join(output_root_dir, 'checkpoint-{}'.format(epoch)) + output_dir = os.path.join(output_root_dir, 'checkpoint') + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + print(f"Saving model checkpoint to {output_dir}", flush=True) + + model.save_pretrained(output_dir) + bert_data_util.bert_tokenizer.save_pretrained(output_dir) + + with 
open(os.path.join(output_dir, "training_config.json"), 'w') as fout: + json.dump(vars(config), fout) + + print(f'{global_step} / {t_total} train loss: {exp_loss} lr: {scheduler.get_lr()[0]}', flush=True) + + +def process_train(root_dir, data_dir, glove_path): + output_root_dir = os.path.join(root_dir, f'dl_model_{int(time.time())}') + if not os.path.exists(output_root_dir): + os.makedirs(output_root_dir) + print(f'model out dir {output_root_dir}', flush=True) + + train_data_seq = dataset_utils.get_data_train_dev(data_dir, 'train.jsonl', config) + dev_data_seq = dataset_utils.get_data_train_dev(data_dir, 'dev.jsonl', config) + + word_emb_matrix, word2id, id2word = model_utils.get_word_embd(config, glove_path, train_data_seq) + char_emb_matrix, char2id, id2char = model_utils.get_char_embd(config, id2word) + + vocab = { + 'word2id': word2id, + 'id2word': id2word, + 'char2id': char2id, + 'id2char': id2char + } + embd = { + 'word': word_emb_matrix, + 'char': char_emb_matrix + } + with codecs.open(os.path.join(output_root_dir, 'word.vocab'), "w", encoding="utf8") as f: + f.write('\n'.join(id2word) + '\n') + with codecs.open(os.path.join(output_root_dir, 'char.vocab'), "w", encoding="utf8") as f: + f.write('\n'.join(id2char) + '\n') + + train_data = lstm_data_util.get_data(train_data_seq, vocab, config) + dev_data = lstm_data_util.get_data(dev_data_seq, vocab, config) + + train(output_root_dir, embd, train_data, dev_data) + + +def get_model(model_dir, vocab, num_tag): + embd = model_utils.get_random_embedding(vocab, config) + model = model_lstm.NER_SOFTMAX_CHAR_CRF(embd, config, const.label_pad_id, num_tag) + + #load model + save_directory = os.path.join(root_dir, model_dir + '/checkpoint') + model_file_path = os.path.join(save_directory, "pytorch_model.bin") + print(f'reading model from {model_file_path}') + state_dict = torch.load(model_file_path, map_location=lambda storage, location: storage) + model.eval() + model.load_state_dict(state_dict, strict=False) + + if is_cuda: + model = model.cuda() + return model + +def process_eval(root_dir, model_dir, data_dir, filename): + tag_vocab = dataset_utils.get_tag_vocab(config) + tag2id = {t: i for i, t in enumerate(tag_vocab)} + + with codecs.open(os.path.join(model_dir, 'word.vocab'), "r", encoding="utf8") as f: + id2word = f.readlines().split('\n') + word2id = {v: k for k, v in enumerate(id2word)} + with codecs.open(os.path.join(model_dir, 'char.vocab'), "r", encoding="utf8") as f: + id2char = f.readlines().split('\n') + char2id = {v: k for k, v in enumerate(id2char)} + vocab = { + 'word2id': word2id, + 'id2word': id2word, + 'char2id': char2id, + 'id2char': id2char + } + model = get_model(model_dir, vocab, len(tag2id)) + + test_data_seq, metadata = dataset_utils.get_data_test(data_dir, filename) + test_data = lstm_data_util.get_data(test_data_seq, vocab, config) + + preds_list, _, _ = predictions(test_data, model, config) + eval_util.dump_result(preds_list, metadata, test_data, root_dir, 'boundary_model.txt', config) + +if __name__ == "__main__": + prefix = 'prop' #'private-projects/propganda' + root_dir = os.path.join(os.path.expanduser("~"), prefix + '/exp') + data_dir = os.path.join(os.path.expanduser("~"), prefix + '/datasets') + glove_dir = os.path.join(os.path.expanduser("~"), 'dl_entity/glove.6B/glove.6B.100d.txt') + process_train(root_dir, data_dir, glove_dir) + #model_dir = os.path.join(os.path.expanduser("~"), 'private-projects/propganda/exp/dl_model_1579925959') + #process_eval(root_dir, model_dir, data_dir, 'test_phase0.jsonl') + diff 
--git a/NER_BERT/model_utils.py b/NER_BERT/model_utils.py new file mode 100644 index 0000000..631c54d --- /dev/null +++ b/NER_BERT/model_utils.py @@ -0,0 +1,99 @@ +import numpy as np +import codecs +import re +from collections import Counter + +import const + +def init_lstm_wt(lstm): + for names in lstm._all_weights: + for name in names: + if name.startswith('weight_'): + wt = getattr(lstm, name) + drange = np.sqrt(6. / (np.sum(wt.size()))) + wt.data.uniform_(-drange, drange) + + elif name.startswith('bias_'): + # set forget bias to 1 + bias = getattr(lstm, name) + n = bias.size(0) + start, end = n // 4, n // 2 + bias.data.fill_(0.) + bias.data[start:end].fill_(1.) + + +def init_linear_wt(linear): + drange = np.sqrt(6. / (np.sum(linear.weight.size()))) + linear.weight.data.uniform_(-drange, drange) + + if linear.bias is not None: + linear.bias.data.fill_(0.) + + +def get_glove(glove_path): + print("Loading GLoVE vectors from file: {}".format(glove_path)) + word_to_vector = {} + + # go through glove vecs + with codecs.open(glove_path, 'r', 'utf-8') as fh: + for line in fh: + line = re.split('\s+', line.strip()) + word = line[0] + vector = list(map(float, line[1:])) + word_to_vector[word] = vector + + return word_to_vector + +def get_word_embd(config, glove_path, examples): + word_to_vector = get_glove(glove_path) + + word_freq_map = Counter() + for tokens, tags in examples: + word_freq_map.update(tokens) + + word_freq_map.update(word_to_vector.keys()) + orig_tokens = set([w for w, ct in word_freq_map.most_common()]) + orig_tokens = sorted(list(orig_tokens.union(word_to_vector.keys()))) + + id_to_word = const._START_VOCAB.copy() + id_to_word.extend(orig_tokens) + + word_to_id = {v: k for k, v in enumerate(id_to_word)} + + word_emb_matrix = np.random.uniform(low=-1.0, high=1.0, + size=(len(id_to_word), config.word_emdb_dim)) + pretrained_init = 0 + for wid, w in enumerate(id_to_word): + if w in word_to_vector: + word_emb_matrix[wid, :] = word_to_vector[w] + pretrained_init += 1 + + return word_emb_matrix, word_to_id, id_to_word + +def get_char_embd(config, id_to_word): + char_freq_map = Counter() + for w in id_to_word: + char_freq_map.update([c for c in w]) + + id_to_char = const._START_VOCAB.copy() + id_to_char.extend([c for c, ct in char_freq_map.most_common()]) + + char_to_id = {v: k for k, v in enumerate(id_to_char)} + + char_emb_matrix = np.random.uniform(low=-1.0, high=1.0, + size=(len(id_to_char), config.char_embd_dim)) + + return char_emb_matrix, char_to_id, id_to_char + +def get_random_embedding(vocab, config): + word_emb_matrix = np.random.uniform(low=-1.0, high=1.0, + size=(len(vocab['id2word']), config.word_emdb_dim)) + + char_emb_matrix = np.random.uniform(low=-1.0, high=1.0, + size=(len(vocab['id2char']), config.char_embd_dim)) + + embd = { + 'word': word_emb_matrix, + 'char': char_emb_matrix + } + return embd diff --git a/NER_BERT/transformer.py b/NER_BERT/transformer.py new file mode 100644 index 0000000..a2ce657 --- /dev/null +++ b/NER_BERT/transformer.py @@ -0,0 +1,110 @@ +#Code is based on http://nlp.seas.harvard.edu/2018/04/03/attention.html + +from __future__ import unicode_literals, print_function, division + +import torch +import torch.nn as nn +import torch.nn.functional as F +import logging +import math + +logging.basicConfig(level=logging.INFO) + +class PositionalEncoding(nn.Module): + def __init__(self, d_model, dropout, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) + 
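+        # precompute the sinusoidal table once: even dimensions get sin, odd get cos,
+        # each scaled by 1/10000^(2i/d_model); the table is registered as a buffer below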
position = torch.arange(0, max_len).float().unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * + -(math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + x = x + self.pe[:, :x.size(1)] + return self.dropout(x) + +class MultiHeadedAttention(nn.Module): + def __init__(self, num_head, d_model, dropout=0.1): + super(MultiHeadedAttention, self).__init__() + assert d_model % num_head == 0 + self.d_k = d_model // num_head #d_k == d_v + self.h = num_head + + self.linear_key = nn.Linear(d_model, d_model) + self.linear_value = nn.Linear(d_model, d_model) + self.linear_query = nn.Linear(d_model, d_model) + self.linear_out = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(p=dropout) + + def attention(self, query, key, value, mask, dropout=None): + d_k = query.size(-1) + scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k) + scores = scores.masked_fill(mask == 0, -1e9) + + p_attn = F.softmax(scores, dim=-1) + if dropout is not None: + p_attn = dropout(p_attn) + return torch.matmul(p_attn, value), p_attn + + def forward(self, query, key, value, mask): + nbatches = query.size(0) + query = self.linear_query(query).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) + key = self.linear_key(key).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) + value = self.linear_value(value).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) + + mask = mask.unsqueeze(1) + x, attn = self.attention(query, key, value, mask, dropout=self.dropout) + x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k) + return self.linear_out(x) + +class AffineLayer(nn.Module): + def __init__(self, dropout, d_model, d_ff): + super(AffineLayer, self).__init__() + self.w_1 = nn.Linear(d_model, d_ff) + self.w_2 = nn.Linear(d_ff, d_model) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + return self.w_2(self.dropout(F.relu(self.w_1(x)))) + +class EncoderLayer(nn.Module): + def __init__(self, num_head, dropout, d_model, d_ff): + super(EncoderLayer, self).__init__() + + self.att_layer = MultiHeadedAttention(num_head, d_model, dropout) + self.norm_att = nn.LayerNorm(d_model) + self.dropout_att = nn.Dropout(dropout) + + self.affine_layer = AffineLayer(dropout, d_model, d_ff) + self.norm_affine = nn.LayerNorm(d_model) + self.dropout_affine = nn.Dropout(dropout) + + def forward(self, x, mask): + x_att = self.norm_att(x*mask) + x_att = self.att_layer(x_att, x_att, x_att, mask) + x = x + self.dropout_att(x_att) + + x_affine = self.norm_affine(x*mask) + x_affine = self.affine_layer(x_affine) + return x + self.dropout_affine(x_affine) + +class Encoder(nn.Module): + def __init__(self, N, num_head, dropout, d_model, d_ff): + super(Encoder, self).__init__() + self.position = PositionalEncoding(d_model, dropout) + self.layers = nn.ModuleList() + for _ in range(N): + self.layers.append(EncoderLayer(num_head, dropout, d_model, d_ff)) + self.norm = nn.LayerNorm(d_model) + + def forward(self, word_embed, mask): + x = self.position(word_embed) + for layer in self.layers: + x = layer(x, mask) + return self.norm(x*mask) diff --git a/README.md b/README.md index dafe84b..9cc25a9 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,10 @@ -# Sequence Prediction +# Various BERT-CRF and BiLSTM-CRF based model for NER ## TO-DO ### Datset - - [x] conll2003 - - [ ] atis + ### Neural NER - - [x] CharLSTM+WordLSTM+CRF: [Lample .etc, 
NAACL16](http://www.aclweb.org/anthology/N/N16/N16-1030.pdf) - - [x] Make a CoNLL-2003 batcher @@ -12,6 +13,8 @@ - - [x] Implement CharLSTM + WordLSTM + softmax - - [x] Implement CharLSTM + WordLSTM + CRF - - [x] Tranformer encoder + CRF +- - [x] BERT encoder + CRF +- - [x] pytorch JIT compilable Viterbi Decoder https://github.com/atulkum/sequence_prediction/blob/master/NER_BERT/decoder.py#L9 ### Slot Filling + intent prediciton - - [ ] [Attention-Based Recurrent Neural Network Models for Joint Intent Detection and Slot Filling](https://arxiv.org/abs/1609.01454) diff --git a/neural_ner/model_lstm.py b/neural_ner/model_lstm.py index f42d207..cfab61c 100644 --- a/neural_ner/model_lstm.py +++ b/neural_ner/model_lstm.py @@ -108,11 +108,10 @@ def forward(self, batch): h = self.tanh_layer(h) logits = self.hidden2tag(h) logits = logits.view(b, t_k, -1) - + logits = F.log_softmax(logits, dim=2) return logits def neg_log_likelihood(self, logits, y, s_lens): - log_smx = F.log_softmax(logits, dim=2) loss = F.nll_loss(log_smx.transpose(1, 2), y, ignore_index=Constants.TAG_PAD_ID, reduction='none') loss = loss.sum(dim=1) / s_lens.float() loss = loss.mean() diff --git a/transformer_models/dataset_roberta.py b/transformer_models/dataset_roberta.py new file mode 100644 index 0000000..1a61c01 --- /dev/null +++ b/transformer_models/dataset_roberta.py @@ -0,0 +1,154 @@ + +import torch +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import Dataset +from transformers import RobertaTokenizer +import pandas as pd +from ast import literal_eval +from torch.nn import CrossEntropyLoss + +tokenizer = RobertaTokenizer.from_pretrained('roberta-base') +id2tag = ['O', 'B-toxic', 'I-toxic'] +tag2id = {v:k for k, v in enumerate(id2tag)} +tag_pad_id = CrossEntropyLoss().ignore_index + +def encode_roberta(sentence): + sentence_tokens = [tokenizer.tokenize(sentence[0])] + \ + [tokenizer.tokenize(f' {t}') for t in sentence[1:]] + sentence_ids = [tokenizer.convert_tokens_to_ids(t) for t in sentence_tokens] + start_idx_mask = [] + all_ids = [] + for subwords in sentence_ids: + curr_mask = [1] + if len(subwords) > 1: + curr_mask += [0] * (len(subwords) - 1) + start_idx_mask.extend(curr_mask) + all_ids.extend(subwords) + special_token_mask = tokenizer.get_special_tokens_mask(all_ids) + + prefix_offset = 0 + while prefix_offset < len(special_token_mask) and special_token_mask[prefix_offset] == 1: + prefix_offset += 1 + suffix_offset = len(special_token_mask) - len(start_idx_mask) - prefix_offset + start_idx_mask = [0] * prefix_offset + start_idx_mask + [0] * suffix_offset + + sentence_inputs = tokenizer.prepare_for_model(all_ids, add_special_tokens=True) + input_ids = sentence_inputs["input_ids"] + attention_mask = sentence_inputs["attention_mask"] + ####### + inputs = tokenizer( + text=' '.join(sentence), + add_special_tokens=True + ) + assert inputs["input_ids"] == input_ids + assert inputs["attention_mask"] == attention_mask + ####### + return input_ids, attention_mask, start_idx_mask + +def get_labels_tokens(orig_sentence, chunks): + curr = 0 + labels = [] + tokens = [] + for s, e in chunks: + other_txt = orig_sentence[curr:s].split() + label_txt = orig_sentence[s:e + 1].split() + curr = e + 1 + tokens.extend(other_txt) + labels.extend(['O'] * len(other_txt)) + + tokens.append(label_txt[0]) + labels.append('B-toxic') + for k in range(1, len(label_txt)): + tokens.append(label_txt[k]) + labels.append('I-toxic') + if curr < len(orig_sentence): + other_txt = orig_sentence[curr:].split() + 
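+            # any text after the last toxic span is tokenized and tagged 'O'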
tokens.extend(other_txt) + labels.extend(['O'] * len(other_txt)) + return tokens, labels + +def get_chunks(span): + chunks = [] + curr_start = None + for span_i, t in enumerate(span): + if span_i == 0 or curr_start is None: + curr_start = t + elif t > span[span_i - 1] + 1: + chunks.append((curr_start, span[span_i - 1])) + curr_start = t + if curr_start is not None: + chunks.append((curr_start, span[-1])) + return chunks + +def get_text_from_ids(input_ids): + return tokenizer.convert_tokens_to_string( + [tokenizer._convert_id_to_token(input_id) for input_id in input_ids]) + +class SpanDataset(Dataset): + def __getitem__(self, n): + return self._features[n] + + def __len__(self): + return len(self._features) + + def __init__(self, phase): + self._phase = phase + self.init_dataset() + + def init_dataset(self): + train = pd.read_csv("tsd_train.csv") + sentences = train['text'] + if self._phase in {'train', 'dev'}: + spans = train.spans.apply(literal_eval) + max_seq_len = -1 + max_token_len = -1 + features = [] + for i, orig_sentence in enumerate(sentences): + chunks = [] + if self._phase in {'train', 'dev'}: + chunks = get_chunks(spans[i]) + + tokens, labels = get_labels_tokens(orig_sentence, chunks) + # roberta tokenization + input_ids, attention_mask, start_idx_mask = encode_roberta(tokens) + max_seq_len = max(max_seq_len, len(input_ids)) + max_token_len = max(max_token_len, len(labels)) + labels_ids = [tag2id[k] for k in labels] + padded_labels_ids = labels_ids + [tag_pad_id]*(200 - len(labels_ids)) + datum = { + 'input_ids': torch.LongTensor(input_ids), + 'attention_mask': torch.LongTensor(attention_mask), + 'start_idx_mask': torch.BoolTensor(start_idx_mask), + 'labels': torch.LongTensor(labels_ids), + 'padded_labels': torch.LongTensor(padded_labels_ids) + } + features.append(datum) + print(f'max_seq_len {max_seq_len} max_token_len {max_token_len}') + self._features = features + +def variable_collate_fn(batch): + batch_features = {} + + batch_features['input_ids'] = pad_sequence([x['input_ids'] for x in batch], + batch_first=True, + padding_value=tokenizer.pad_token_id) + batch_features['attention_mask'] = pad_sequence([x['attention_mask'] for x in batch], + batch_first=True, + padding_value=0) + batch_features['start_idx_mask'] = pad_sequence([x['start_idx_mask'] for x in batch], + batch_first=True, + padding_value=0) + if 'labels' in batch[0]: + batch_features['labels'] = pad_sequence([x['labels'] for x in batch], + batch_first=True, + padding_value=tag_pad_id) + batch_features['padded_labels'] = pad_sequence([x['padded_labels'] for x in batch], + batch_first=True, + padding_value=tag_pad_id) + return batch_features + +if __name__ == '__main__': + data_iter = SpanDataset('dev') + for d in data_iter: + print(d) + break diff --git a/transformer_models/model.py b/transformer_models/model.py new file mode 100644 index 0000000..5fccdb4 --- /dev/null +++ b/transformer_models/model.py @@ -0,0 +1,168 @@ +import torch.nn as nn +import numpy as np +from torch.nn.utils.rnn import pad_sequence +import torch.nn.functional as F +from transformers import BertPreTrainedModel, RobertaModel, RobertaConfig +import torch +from torch.nn import CrossEntropyLoss + +tag_pad_id = CrossEntropyLoss().ignore_index + +class CRFBert(BertPreTrainedModel): + config_class = RobertaConfig + base_model_prefix = "roberta" + def __init__(self, config): + super(CRFBert, self).__init__(config) + num_tags = 3 + self.roberta = RobertaModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = 
nn.Linear(config.hidden_size, num_tags) + self.init_weights() + + #crf + self.start_tag, self.end_tag = num_tags, num_tags + 1 + self.transitions = nn.Parameter(torch.Tensor(num_tags + 2, num_tags + 2)) + nn.init.constant_(self.transitions, -np.log(num_tags)) + self.transitions.data[self.end_tag, :] = -10000 + self.transitions.data[:, self.start_tag] = -10000 + + def forward(self, input_ids, attention_mask, start_idx_mask, labels, **kwargs): + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + output_attentions=True, + output_hidden_states=True, + return_dict=True + ) + token_embedding = self.dropout(outputs.last_hidden_state) + #get start idx embedding of each tokens + start_idx_lens = start_idx_mask.sum(1).view(-1) + embd_selected = torch.masked_select(token_embedding, + start_idx_mask.unsqueeze(2)).view(-1,token_embedding.size()[-1]) + embd_split = torch.split(embd_selected, start_idx_lens.tolist()) + embd_padded = pad_sequence(embd_split, batch_first=True, padding_value=0) + logits = self.classifier(embd_padded) + logits= F.log_softmax(logits, dim=-1) + mask = labels.ne(tag_pad_id) + loss = self.get_crf_loss(logits, labels, mask) + outputs = (loss,) + if not self.training: + sentence_score, pred_tag = self.viterbi_decode_batch(logits, mask) + pred_tag = nn.ConstantPad1d((0, 200 - pred_tag.size(1)), tag_pad_id)(pred_tag) + outputs += (pred_tag,) + return outputs + + def viterbi_decode_batch(self, emissions, mask): + seq_len = emissions.shape[1] + options = dict(dtype=emissions.dtype, device=emissions.device) + + log_prob = emissions[:, 0].clone() + log_prob += self.transitions[self.start_tag, : self.start_tag].unsqueeze(0) + + end_scores = log_prob + self.transitions[: self.start_tag, self.end_tag].unsqueeze(0) + + best_scores_list = [] + best_scores_list.append(end_scores.unsqueeze(1)) + + best_paths_0 = torch.Tensor().long().to(emissions.device) + best_paths_list = [best_paths_0] + + for idx in range(1, seq_len): + broadcast_emissions = emissions[:, idx].unsqueeze(1) + broadcast_transmissions = self.transitions[: self.start_tag, : self.start_tag].unsqueeze(0) + broadcast_log_prob = log_prob.unsqueeze(2) + score = broadcast_emissions + broadcast_transmissions + broadcast_log_prob + max_scores, max_score_indices = torch.max(score, 1) + best_paths_list.append(max_score_indices.unsqueeze(1)) + end_scores = max_scores + self.transitions[: self.start_tag, self.end_tag].unsqueeze(0) + + best_scores_list.append(end_scores.unsqueeze(1)) + log_prob = max_scores + + best_scores = torch.cat(best_scores_list, 1).float() + best_paths = torch.cat(best_paths_list, 1) + + max_scores, max_indices_from_scores = torch.max(best_scores, 2) + + valid_index_tensor = torch.tensor(0, **options).long() + padding_tensor = torch.tensor(tag_pad_id, **options).long() + + labels = max_indices_from_scores[:, seq_len - 1] + labels = torch.where(mask[:, seq_len - 1] != 1.0, padding_tensor, labels) + all_labels = labels.unsqueeze(1).long() + + for idx in range(seq_len - 2, -1, -1): + indices_for_lookup = all_labels[:, -1].clone() + indices_for_lookup = torch.where(indices_for_lookup == tag_pad_id, valid_index_tensor, + indices_for_lookup) + + indices_from_prev_pos = best_paths[:, idx, :].gather(1, indices_for_lookup.view(-1, 1).long()).squeeze(1) + indices_from_prev_pos = torch.where(mask[:, idx + 1] != 1.0, padding_tensor, indices_from_prev_pos) + + indices_from_max_scores = max_indices_from_scores[:, idx] + indices_from_max_scores = torch.where(mask[:, idx + 1] == 1.0, padding_tensor, 
indices_from_max_scores) + + labels = torch.where(indices_from_max_scores == tag_pad_id, indices_from_prev_pos, + indices_from_max_scores) + + # Set to ignore_index if present state is not valid. + labels = torch.where(mask[:, idx] != 1.0, padding_tensor, labels) + all_labels = torch.cat((all_labels, labels.view(-1, 1).long()), 1) + + last_tag_indices = mask.sum(1, dtype=torch.long) - 1 + sentence_score = max_scores.gather(1, last_tag_indices.view(-1, 1)).squeeze(1) + + return sentence_score, torch.flip(all_labels, [1]) + + def get_log_p_z(self, emissions, mask): + seq_len = emissions.shape[1] + log_alpha = emissions[:, 0].clone() + log_alpha += self.transitions[self.start_tag, : self.start_tag].unsqueeze(0) + + for idx in range(1, seq_len): + broadcast_emissions = emissions[:, idx].unsqueeze(1) + broadcast_transitions = self.transitions[: self.start_tag, : self.start_tag].unsqueeze(0) + broadcast_logprob = log_alpha.unsqueeze(2) + score = broadcast_logprob + broadcast_emissions + broadcast_transitions + + score = torch.logsumexp(score, 1) + log_alpha = score * mask[:, idx].unsqueeze(1) + log_alpha.squeeze(1) * (1.0 - mask[:, idx].unsqueeze(1)) + + log_alpha += self.transitions[: self.start_tag, self.end_tag].unsqueeze(0) + return torch.logsumexp(log_alpha.squeeze(1), 1) + + def get_log_p_Y_X(self, emissions, mask, orig_tags): + seq_len = emissions.shape[1] + tags = orig_tags.clone() + tags[tags < 0] = 0 + + llh = self.transitions[self.start_tag, tags[:, 0]].unsqueeze(1) + llh += emissions[:, 0, :].gather(1, tags[:, 0].view(-1, 1)) * mask[:, 0].unsqueeze(1) + + for idx in range(1, seq_len): + old_state, new_state = ( + tags[:, idx - 1].view(-1, 1), + tags[:, idx].view(-1, 1), + ) + emission_scores = emissions[:, idx, :].gather(1, new_state) + transition_scores = self.transitions[old_state, new_state] + llh += (emission_scores + transition_scores) * mask[:, idx].unsqueeze(1) + + last_tag_indices = mask.sum(1, dtype=torch.long) - 1 + last_tags = tags.gather(1, last_tag_indices.view(-1, 1)) + + llh += self.transitions[last_tags.squeeze(1), self.end_tag].unsqueeze(1) + + return llh.squeeze(1) + + def log_likelihood(self, emissions, tags, mask): + log_z = self.get_log_p_z(emissions, mask) + log_p_y_x = self.get_log_p_Y_X(emissions, mask, tags) + return log_p_y_x - log_z + + def get_crf_loss(self, logits, y, mask): + s_lens = mask.sum(1) + loss = -1 * self.log_likelihood(logits, y, mask.float()) + loss = loss / s_lens.float() + loss = loss.mean() + return loss diff --git a/transformer_models/train_roberta.py b/transformer_models/train_roberta.py new file mode 100644 index 0000000..1be5ddc --- /dev/null +++ b/transformer_models/train_roberta.py @@ -0,0 +1,62 @@ +import time +import random +import numpy as np + +import torch +from transformers import Trainer, TrainingArguments +from torch.utils.data import random_split + +from .dataset_roberta import SpanDataset, variable_collate_fn +from .eval_utils import compute_metrics +from .model import CRFBert + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available() > 0: + torch.cuda.manual_seed_all(seed) + +def train(): + all_dataset = SpanDataset('train') + train_size = int(0.99 * len(all_dataset)) + test_size = len(all_dataset) - train_size + train_dataset, eval_dataset = random_split(all_dataset, [train_size, test_size]) + + model = CRFBert.from_pretrained('roberta-base') + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(f'Number of trainable parameter: 
{n_params}')
+
+    training_args = TrainingArguments(
+        output_dir=f'./results_{int(time.time())}',  # output directory
+        num_train_epochs=3,              # total # of training epochs
+        per_device_train_batch_size=8,   # batch size per device during training
+        per_device_eval_batch_size=8,    # batch size for evaluation
+        warmup_steps=500,                # number of warmup steps for learning rate scheduler
+        weight_decay=0.01,               # strength of weight decay
+        logging_dir='./logs',            # directory for storing logs
+        save_total_limit=1,
+        seed=42,
+        label_names=["padded_labels"]
+    )
+    # Initialize our Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        compute_metrics=compute_metrics,
+        data_collator=variable_collate_fn,
+    )
+    #trainer.train()
+    #trainer.save_model()
+    result = trainer.evaluate()
+    print(result)
+
+if __name__ == '__main__':
+    set_seed(42)
+    train()
+
+