Browse Source

Add indexing files

yasu 3 years ago
commit
e92444639e
3 changed files with 115 additions and 0 deletions
  1. 31 0
      scripts/create_index.py
  2. 58 0
      scripts/retrieval.py
  3. 26 0
      scripts/run_index.sh

+ 31 - 0
scripts/create_index.py

@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+
+import argparse
+
+import pyterrier as pt
+
def parse_arguments():
    """Parse command-line arguments.

    Positional arguments:
        xml_list:  path to a text file listing one TREC XML file per line.
        index_dir: output directory name for the Terrier index.
    """
    ap = argparse.ArgumentParser()
    for positional in ('xml_list', 'index_dir'):
        ap.add_argument(positional)
    return ap.parse_args()
+
if __name__=="__main__":
    args = parse_arguments()

    # Start the Terrier JVM before any index operation.
    pt.init()

    # The input file holds one XML document path per line.
    with open(args.xml_list) as ifile:
        xml_list = [line.rstrip('\n') for line in ifile]

    indexer = pt.TRECCollectionIndexer('./' + args.index_dir)
    # Terrier properties: enable positional (block) indexing and keep a
    # forward metadata index mapping docno (up to 50 chars) to documents.
    index_properties = {
        'block.indexing': 'true',
        'invertedfile.lexiconscanner': 'pointers',
        'indexer.meta.forward.keys': 'docno',
        'indexer.meta.forward.keylens': '50',
    }
    indexer.setProperties(**index_properties)
    indexref = indexer.index(xml_list)

+ 58 - 0
scripts/retrieval.py

@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+
+import argparse
+
+import pandas as pd
+import pyterrier as pt
+
+id_template = 'spotify:episode:{}_{}'
+
def parse_arguments():
    """Parse command-line arguments for the retrieval run.

    Positional arguments:
        data_properties: directory (relative to CWD) holding the Terrier index.
        topics:          TREC XML topics file.
        run_id:          run tag written into each output line.
    Options:
        --format: 'trec' (default) or 'submission' output layout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('data_properties')
    ap.add_argument('topics')
    ap.add_argument('run_id', type=str)
    ap.add_argument('--format', default='trec',
                    choices=['trec', 'submission'])
    return ap.parse_args()
+
def write_submission(df, run_id=None):
    """Print ranked results to stdout in the submission format.

    Args:
        df: DataFrame whose first four columns are qid, rank, docno, score.
        run_id: run tag for the first output column. Defaults to the
            module-level CLI ``args.run_id`` so existing call sites that
            pass only ``df`` keep working.
    """
    if run_id is None:
        run_id = args.run_id  # backward-compatible fallback to global CLI args
    print('RUNID QUERYID RANK SCORE EPISODEID OFFSET')
    for row in df.itertuples(index=False):
        qid, rank, docno, score = row[0], row[1], row[2], row[3]

        # docno encodes "<episode>_<start>-<end>"; split on the LAST '_' so an
        # episode id containing underscores no longer raises ValueError.
        episode, _, timestamp = docno.rpartition('_')
        start_time = str(float(timestamp.split('-')[0]))
        episode_id = 'spotify:episode:{}_{}'.format(episode, start_time)
        print('{} {} {} {} {} {}'.format(
            run_id, qid, rank, score, episode_id, start_time))
+
def write_trec(df, run_id=None):
    """Print ranked results to stdout in TREC run format.

    Args:
        df: DataFrame whose first four columns are qid, rank, docno, score.
        run_id: run tag for the last output column. Defaults to the
            module-level CLI ``args.run_id`` so existing call sites that
            pass only ``df`` keep working.
    """
    if run_id is None:
        run_id = args.run_id  # backward-compatible fallback to global CLI args
    # NOTE(review): header advertises 'Q0' but data rows emit '0' — confirm
    # which literal the evaluation tooling expects before changing either.
    print('query-id Q0 document-id rank score STANDARD')
    for row in df.itertuples(index=False):
        qid, rank, docno, score = row[0], row[1], row[2], row[3]

        # docno encodes "<episode>_<start>-<end>"; split on the LAST '_' so an
        # episode id containing underscores no longer raises ValueError.
        episode, _, timestamp = docno.rpartition('_')
        start_time = str(float(timestamp.split('-')[0]))
        episode_id = 'spotify:episode:{}_{}'.format(episode, start_time)
        print('{} {} {} {} {} {}'.format(
            qid, '0', episode_id, rank, score, run_id))
+
if __name__=="__main__":
    args = parse_arguments()

    # Start the Terrier JVM before touching the index.
    pt.init()

    # Open the Terrier index stored under ./<data_properties>.
    index = pt.IndexFactory.of(pt.IndexRef.of('./' + args.data_properties))

    # Run default BatchRetrieve over the TREC XML topics.
    topics = pt.Utils.parse_trecxml_topics_file(args.topics)
    res = pt.BatchRetrieve(index).transform(topics)

    df = pd.DataFrame(res, columns=['qid', 'rank', 'docno', 'score'])
    # Dispatch on the requested output layout ('--format' restricts choices).
    writers = {'trec': write_trec, 'submission': write_submission}
    if args.format in writers:
        writers[args.format](df)

+ 26 - 0
scripts/run_index.sh

@@ -0,0 +1,26 @@
#!/bin/bash
# Three-stage indexing pipeline for the Spotify Podcasts corpus:
#   stage 1: list transcript JSON paths per top-level shard
#   stage 2: convert JSON to XML docs (one sbatch job per shard)
#   stage 3: collect XML paths and build the Terrier index
# Set $stage to resume from a later stage.

transcripts_dir=Spotify-Podcasts-2020/podcasts-no-audio-13GB/spotify-podcasts-2020/podcasts-transcripts
data_dir=Spotify-Podcasts-2020/exp_data/exp_data1
stage=3

if [ $stage -le 1 ]; then
    # -p makes the stage re-runnable; variables quoted so paths with
    # whitespace do not word-split.
    mkdir -p "$data_dir/json_paths"
    for d in "$transcripts_dir"/*; do
        name=$(basename "$d")
        find "$d" | grep "json" > "$data_dir/json_paths/$name"
    done
fi

if [ $stage -le 2 ]; then
    mkdir -p "$data_dir/xml_dir"
    for i in $(seq 0 7); do
        mkdir -p "$data_dir/xml_dir/$i"
        sbatch --wrap="python3 scripts/json_to_doc.py $data_dir/json_paths/$i $data_dir/xml_dir/$i"
    done
fi

if [ $stage -le 3 ]; then
    find "$data_dir"/xml_dir/* | grep "\.xml" > "$data_dir/xml_file_path"
    sbatch --wrap="python3 scripts/create_index.py $data_dir/xml_file_path $data_dir/index"
fi