run_index.sh 793 B

1234567891011121314151617181920212223242526
  1. #!/bin/bash
  2. transcripts_dir=Spotify-Podcasts-2020/podcasts-no-audio-13GB/spotify-podcasts-2020/podcasts-transcripts
  3. data_dir=Spotify-Podcasts-2020/exp_data/exp_data1
  4. stage=3
  5. if [ $stage -le 1 ]; then
  6. mkdir $data_dir/json_paths
  7. for d in $transcripts_dir/*; do
  8. name=$(basename $d)
  9. find $d | grep "json" > $data_dir/json_paths/$name
  10. done
  11. fi
  12. if [ $stage -le 2 ]; then
  13. mkdir -p $data_dir/xml_dir
  14. for i in $(seq 0 7); do
  15. mkdir $data_dir/xml_dir/$i
  16. sbatch --wrap="python3 scripts/json_to_doc.py $data_dir/json_paths/$i $data_dir/xml_dir/$i"
  17. done
  18. fi
  19. if [ $stage -le 3 ]; then
  20. find $data_dir/xml_dir/* | grep "\.xml" > $data_dir/xml_file_path
  21. sbatch --wrap="python3 scripts/create_index.py $data_dir/xml_file_path $data_dir/index"
  22. fi