
Commit 7935f98

Merge pull request #7 from iwknow/frank-dev
Fix types and enable multiple text/image inputs
2 parents: ab561c1 + def83c6

File tree

11 files changed: +128 additions, -71 deletions

.gitignore

Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# data folder contains downloadable public datasets
+data/
+
+# Editor
+.vscode/

lanistr/dataset/amazon/amazon_utils.py

Lines changed: 2 additions & 2 deletions

@@ -42,11 +42,11 @@ def load_multimodal_data(args: omegaconf.DictConfig) -> pd.DataFrame:
     A pandas DataFrame containing the loaded data.
   """
   if args.task == "pretrain":
-    path = os.path.join(args.data_dir, f"{args.category}_total.json.gz")
+    path = os.path.join(args.data_dir, f"{args.category}.json.gz")
     data = read_gzip(path)
   else:
     path_to_clean_data = os.path.join(
-        args.data_dir, f"{args.category}_total.csv"
+        args.data_dir, f"{args.category}.csv"
     )
     data = pd.read_csv(path_to_clean_data)
     data = data.reset_index(drop=True)
lanistr/dataset/amazon/download_images.py

Lines changed: 12 additions & 12 deletions

@@ -17,7 +17,7 @@
 import gzip
 import json
 import os
-from typing import List, Optional
+from typing import Dict, List, Optional

 import omegaconf
 import pandas as pd
@@ -60,7 +60,7 @@ def load_and_clean_meta_data(
   return metadata


-def read_gzip(name: str, args: omegaconf.DictConfig) -> List[dict[str, str]]:
+def read_gzip(name: str, args: omegaconf.DictConfig) -> List[Dict[str, str]]:
   """Reads a gzipped file and returns a list of JSON objects.

   Args:
@@ -103,8 +103,8 @@ def load_data(args: omegaconf.DictConfig) -> pd.DataFrame:
 def get_reviews(
     row: pd.Series,
     index: int,
-    nan_indices_summary: list[int],
-    nan_indices_review_text: list[int],
+    nan_indices_summary: List[int],
+    nan_indices_review_text: List[int],
 ) -> Optional[str]:
   """Extracts and cleans the review text from a row of data.
@@ -135,7 +135,7 @@ def get_reviews(


 def get_review_votes(
-    row: pd.Series, index: int, nan_indices_votes: list[int]
+    row: pd.Series, index: int, nan_indices_votes: List[int]
 ) -> int:
   """Extracts and cleans the review vote from a row of data.
@@ -181,7 +181,7 @@ def get_product_brands(


 def download_and_save_image(
-    image_data_dir: str, urls: list[str], index: int
+    image_data_dir: str, urls: List[str], index: int
 ) -> Optional[str]:
   """Downloads and saves the image from a URL.
@@ -236,7 +236,7 @@ def get_product_prices(


 def get_review_names(
-    row: pd.Series, index: int, nan_indices_reviewer_names: list[int]
+    row: pd.Series, index: int, nan_indices_reviewer_names: List[int]
 ) -> Optional[str]:
   """Extracts and cleans the reviewer name from a row of data.
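A note on the typing changes above: the annotations previously mixed builtin generics (list[int], dict[str, str]) with typing imports. Subscripting builtin types in annotations requires Python 3.9+ (PEP 585); on older interpreters the annotation is evaluated when the def executes and raises. A minimal sketch of the failure mode (function names are illustrative):

from typing import List

# On Python 3.8 and earlier this def raises at import time:
#   TypeError: 'type' object is not subscriptable
def bad(indices: list[int]) -> None:
  ...

# The typing generics this commit standardizes on also work on older versions.
def good(indices: List[int]) -> None:
  ...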
@@ -389,7 +389,7 @@ def main():
     # row = data.iloc[index]

     meta_row = meta_data.loc[meta_data['asin'] == row['asin'].item()]
-    meta_data_exists = True if meta_row else False
+    meta_data_exists = False if meta_row.empty else True

     amazon_image_exists = False
     user_image_exists = False
@@ -443,8 +443,8 @@ def main():

   categorical_cols = ['reviewerID', 'verified', 'asin', 'year']
   numerical_cols = ['vote', 'unixReviewTime']
-  image_col = ['ImageFileName']
-  text_col = ['Review']
+  image_cols = ['ImageFileName']
+  text_cols = ['Review']
   label_col = ['labels']

   d = pd.DataFrame()
@@ -474,8 +474,8 @@ def main():
       if item
       not in categorical_cols
       + numerical_cols
-      + image_col
-      + text_col
+      + image_cols
+      + text_cols
       + label_col
   ]
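The meta_data_exists change is a bug fix rather than a style fix: bool() on a pandas DataFrame raises ValueError whether or not the frame is empty, so `True if meta_row else False` raised on every row. DataFrame.empty is the supported emptiness check. A minimal sketch (data values are illustrative):

import pandas as pd

meta_data = pd.DataFrame({'asin': ['B001'], 'brand': ['Acme']})
meta_row = meta_data.loc[meta_data['asin'] == 'B000']  # no match: empty frame

# `True if meta_row else False` raises:
#   ValueError: The truth value of a DataFrame is ambiguous. ...
meta_data_exists = False if meta_row.empty else True  # i.e. not meta_row.empty
print(meta_data_exists)  # False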

lanistr/dataset/amazon/load_data.py

Lines changed: 70 additions & 39 deletions

@@ -56,6 +56,8 @@ def load_amazon(
       )
   )
   feature_names = categorical_cols + numerical_cols
+  image_names = ['ImageFileName']
+  text_names = ['Review']
   train_data, test_data, valid_data = get_train_and_test_splits(
       args, amazon_data
   )
@@ -65,6 +67,8 @@
       'cat_idxs': cat_idxs,
       'cat_dims': cat_dims,
       'feature_names': feature_names,
+      'image_names': image_names,
+      'text_names': text_names,
   }

   dataframes = {
@@ -103,6 +107,8 @@ def create_multimodal_dataset_from_dataframes(
       tokenizer=tokenizer,
       transform=train_transform,
       feature_names=dataframes['tabular_data_information']['feature_names'],
+      image_names=dataframes['tabular_data_information']['image_names'],
+      text_names=dataframes['tabular_data_information']['text_names'],
       text=args.text,
       image=args.image,
       tab=args.tab,
@@ -113,6 +119,8 @@
       tokenizer=tokenizer,
       transform=test_transform,
       feature_names=dataframes['tabular_data_information']['feature_names'],
+      image_names=dataframes['tabular_data_information']['image_names'],
+      text_names=dataframes['tabular_data_information']['text_names'],
       text=args.text,
       image=args.image,
       tab=args.tab,
@@ -123,6 +131,8 @@
       tokenizer=tokenizer,
       transform=train_transform,
       feature_names=dataframes['tabular_data_information']['feature_names'],
+      image_names=dataframes['tabular_data_information']['image_names'],
+      text_names=dataframes['tabular_data_information']['text_names'],
       text=args.text,
       image=args.image,
       tab=args.tab,
@@ -146,6 +156,8 @@ def __init__(
       tokenizer: transformers.BertTokenizer,
       transform: torchvision.transforms.Compose,
       feature_names: List[str],
+      image_names: List[str],
+      text_names: List[str],
       text: bool,
       image: bool,
       tab: bool,
@@ -157,7 +169,9 @@ def __init__(
       df: The dataframe to use for the dataset.
      tokenizer: The tokenizer to use for the text.
       transform: The transform to use for the images.
-      feature_names: The names of the features to use.
+      feature_names: The names of the feature columns.
+      image_names: The names of the image columns.
+      text_names: The names of the text columns.
       text: Whether to use text.
       image: Whether to use images.
       tab: Whether to use tabular data.
@@ -171,7 +185,10 @@ def __init__(
     self.features = self.df[feature_names].values

     if text:
-      self.reviews = df['Review'].values
+      self.texts = df[text_names].values
+
+    if image:
+      self.images = df[image_names].values

     self.mask_generator = MaskGenerator(
         input_size=args.image_size,
@@ -199,48 +216,37 @@ def __getitem__(self, index: int):

     # text
     if self.text:
-      review = self.reviews[index]
-
-      try:
-        item = self.tokenizer.encode_plus(
-            review,
-            max_length=self.args.max_token_length,
-            truncation=True,
-            add_special_tokens=True,
-            return_token_type_ids=False,
-            padding='max_length',
-            return_attention_mask=True,
-            return_tensors='pt',
-        )
-      except Exception as e:  # pylint: disable=broad-exception-caught
-        print(e)
-        item = self.tokenizer.encode_plus(
-            '',
-            max_length=self.args.max_token_length,
-            truncation=True,
-            add_special_tokens=True,
-            return_token_type_ids=False,
-            padding='max_length',
-            return_attention_mask=True,
-            return_tensors='pt',
-        )
+      input_ids_list = []
+      attention_mask_list = []
+      for text in self.texts[index]:
+        encode_result = self.encode_text(text)
+        input_ids_list.append(encode_result['input_ids'])
+        attention_mask_list.append(encode_result['attention_mask'])
+      # input_ids has shape (text_num, token_length)
+      item['input_ids'] = torch.cat(input_ids_list)
+      # attention_mask has shape (text_num, token_length)
+      item['attention_mask'] = torch.cat(attention_mask_list)

     # image
     if self.image:
-      image_filename = row['ImageFileName']
-      if isinstance(image_filename, str):
-        image_path = os.path.join(self.args.image_data_dir, image_filename)
-        img = Image.open(image_path).convert('RGB')
-        img = self.transform(img)
-        item['pixel_values'] = img
-        item['bool_masked_pos'] = self.mask_generator()
-      else:
-
-        item['pixel_values'] = torch.zeros(
+      pixel_values = []
+      bool_masked_positions = []
+      for image_data in self.images[index]:
+        if isinstance(image_data, str):
+          image_path = os.path.join(self.args.image_data_dir, image_data)
+          img = Image.open(image_path).convert('RGB')
+          img = self.transform(img)
+          pixel_values.append(img)
+        else:
+          pixel_values.append(torch.zeros(
             size=(3, self.args.image_size, self.args.image_size),
             dtype=torch.float,
-        )
-        item['bool_masked_pos'] = self.mask_generator()
+          ))
+        bool_masked_positions.append(self.mask_generator())
+      # pixel_values has shape (image_num, channel, width, height)
+      item['pixel_values'] = torch.stack(pixel_values)
+      # bool_masked_positions has shape (image_num, model_patch_size**2)
+      item['bool_masked_positions'] = torch.stack(bool_masked_positions)

     # tabular
     if self.tab:
@@ -261,3 +267,28 @@ def __len__(self) -> int:
       The length of the dataset.
     """
     return len(self.df)
+
+  def encode_text(self, text: str):
+    try:
+      return self.tokenizer.encode_plus(
+          text,
+          max_length=self.args.max_token_length,
+          truncation=True,
+          add_special_tokens=True,
+          return_token_type_ids=False,
+          padding='max_length',
+          return_attention_mask=True,
+          return_tensors='pt',
+      )
+    except Exception as e:  # pylint: disable=broad-exception-caught
+      print(e)
+      return self.tokenizer.encode_plus(
+          '',
+          max_length=self.args.max_token_length,
+          truncation=True,
+          add_special_tokens=True,
+          return_token_type_ids=False,
+          padding='max_length',
+          return_attention_mask=True,
+          return_tensors='pt',
+      )
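For orientation, here is what the new text path of __getitem__ produces per sample: each text column is tokenized separately via encode_text, and the resulting (1, token_length) tensors are concatenated along dim 0. A minimal, self-contained sketch under the commit's assumptions (a BERT tokenizer; the sample strings and max_token_length value are illustrative):

import torch
import transformers

tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
max_token_length = 16

texts = ['Great product, works as advertised.', 'Arrived two weeks late.']
input_ids_list = []
attention_mask_list = []
for text in texts:
  enc = tokenizer.encode_plus(
      text,
      max_length=max_token_length,
      truncation=True,
      add_special_tokens=True,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',
  )
  input_ids_list.append(enc['input_ids'])            # each is (1, 16)
  attention_mask_list.append(enc['attention_mask'])

item = {
    'input_ids': torch.cat(input_ids_list),          # (text_num, token_length)
    'attention_mask': torch.cat(attention_mask_list),
}
print(item['input_ids'].shape)  # torch.Size([2, 16])

The image path is analogous: torch.stack over one (3, image_size, image_size) tensor per image column yields pixel_values of shape (image_num, 3, image_size, image_size).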

lanistr/model/modeling_lanistr.py

Lines changed: 29 additions & 12 deletions

@@ -382,29 +382,46 @@ def forward(self, batch: Mapping[str, torch.Tensor]) -> BaseModelOutput:
     embeds = []
     ##================================= Text =================================##
     if self.args.text:
-      batch['input_ids'] = batch['input_ids'].squeeze(1)
-      batch['attention_mask'] = batch['attention_mask'].squeeze(1)
-
-      # forwarding regular inputs:
-      outputs = self.text_encoder(
-          input_ids=batch['input_ids'],
-          attention_mask=batch['attention_mask'],
+      # batch['input_ids'] has shape (batch_size, text_num, id_length), e.g. [4, 2, 512].
+      batch_size = batch['input_ids'].shape[0]
+      text_num = batch['input_ids'].shape[1]
+      text_contents = batch['input_ids'].flatten(start_dim=0, end_dim=1)
+      attention_mask = batch['attention_mask'].flatten(start_dim=0, end_dim=1)
+
+      text_encoding = self.text_encoder(
+          input_ids=text_contents,
+          attention_mask=attention_mask,
       )
-      last_hidden_state = outputs.last_hidden_state
+      last_hidden_state = text_encoding.last_hidden_state
       text_embeddings = self.text_proj(
           last_hidden_state[:, self.target_token_idx, :]
       )
+      text_embeddings = text_embeddings.reshape(tuple([batch_size, text_num] + list(text_embeddings.shape)[1:]))
+
+      # Average the embeddings for all the text inputs.
+      text_embeddings = text_embeddings.mean(dim=1, keepdim=True)

+      # TODO(Reviewer): the internal code doesn't have normalization. Do we need this? Is the dimension correct? text_embeddings has shape (batch_size, dim1, dim2)
       text_embeddings = F.normalize(text_embeddings, dim=1)
-      embeds.append(text_embeddings.unsqueeze(dim=1))
+      embeds.append(text_embeddings)

     ##================================== Image ===============================##
     if self.args.image:
+      # batch['pixel_values'] has shape (batch_size, image_num, channel, width, height), e.g. [4, 2, 3, 224, 224].
+      batch_size = batch['pixel_values'].shape[0]
+      image_num = batch['pixel_values'].shape[1]
+      images = batch['pixel_values'].flatten(start_dim=0, end_dim=1)

-      image_features = self.image_encoder(
-          pixel_values=batch['pixel_values'], bool_masked_pos=None
+      image_encodings = self.image_encoder(
+          pixel_values=images, bool_masked_pos=None
       )
-      image_embeddings = self.image_proj(image_features.last_hidden_state)
+      image_embeddings = self.image_proj(image_encodings.last_hidden_state)
+      image_embeddings = image_embeddings.reshape(
+          tuple([batch_size, image_num] + list(image_embeddings.shape)[1:])
+      )
+      image_embeddings = image_embeddings.mean(dim=1)
+
+      # TODO(Reviewer): the internal code doesn't have normalization. Do we need this? Is the dimension correct? image_embeddings has shape (batch_size, dim1, dim2)
       image_embeddings = F.normalize(image_embeddings, dim=1)
       embeds.append(image_embeddings)
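The encoder changes above follow a flatten, encode, reshape, pool pattern: per-sample text/image inputs are folded into the batch dimension so the encoder sees an ordinary batch, then the fold is undone and the copies are averaged. A standalone sketch with a random stand-in for the encoder output (shapes follow the commit's example of batch_size=4, text_num=2):

import torch

batch_size, text_num, token_length, hidden = 4, 2, 512, 768
input_ids = torch.zeros(batch_size, text_num, token_length, dtype=torch.long)

# Fold text_num into the batch: (4, 2, 512) -> (8, 512).
flat_ids = input_ids.flatten(start_dim=0, end_dim=1)

# Stand-in for text_encoder(...).last_hidden_state, shape (8, 512, 768).
last_hidden_state = torch.randn(flat_ids.shape[0], token_length, hidden)

# Select the target token (e.g. [CLS] at index 0), then unfold the batch.
token_embeddings = last_hidden_state[:, 0, :]                      # (8, 768)
per_text = token_embeddings.reshape(batch_size, text_num, hidden)  # (4, 2, 768)

# Average over the text inputs; keepdim retains a length-1 modality axis.
text_embeddings = per_text.mean(dim=1, keepdim=True)               # (4, 1, 768)
print(text_embeddings.shape)  # torch.Size([4, 1, 768])

The TODO in the diff looks warranted: after mean(dim=1, keepdim=True) the text embedding is (batch_size, 1, dim), and F.normalize(..., dim=1) over that singleton axis maps every nonzero entry to +/-1; normalizing over the last dimension (dim=-1) is presumably what is intended.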

lanistr/scripts/download_amazon.sh

Lines changed: 3 additions & 3 deletions

@@ -37,8 +37,8 @@ wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/metaFiles2/meta_
 cd ../../../

 # the following take nearly 30 minutes each.
-python datasets/amazon/download_images.py --category All_Beauty
-python datasets/amazon/download_images.py --category AMAZON_FASHION
+python dataset/amazon/download_images.py --category All_Beauty
+python dataset/amazon/download_images.py --category AMAZON_FASHION

 # this will take many hours but it goes by fast because there are not too many images
-python datasets/amazon/download_images.py --category Office_Products
+python dataset/amazon/download_images.py --category Office_Products
File renamed without changes.
