1234567891011121314151617181920212223242526 |
#!/bin/bash
#
# Staged pipeline over the Spotify Podcasts 2020 transcripts.
#
# Usage: bash <this script> [stage]
#   stage  first stage to run (default: 3); every later stage runs as well.
#
# Stages:
#   1  For each entry of $transcripts_dir, write the paths of the *.json
#      files found below it to $data_dir/json_paths/<entry name>.
#   2  For each path list 0..7, submit a Slurm job that runs
#      scripts/json_to_doc.py with the list and $data_dir/xml_dir/<n> as
#      arguments, then block until all of those jobs have finished.
#   3  Write the paths of the *.xml files below $data_dir/xml_dir to
#      $data_dir/xml_file_path and submit a Slurm job that runs
#      scripts/create_index.py with that list and $data_dir/index.
set -euo pipefail

transcripts_dir=Spotify-Podcasts-2020/podcasts-no-audio-13GB/spotify-podcasts-2020/podcasts-transcripts
data_dir=Spotify-Podcasts-2020/exp_data/exp_data1
stage=${1:-3}

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

[[ "$stage" =~ ^[0-9]+$ ]] || die "stage must be a non-negative integer, got: $stage"

if (( stage <= 1 )); then
  [[ -d "$transcripts_dir" ]] || die "transcripts directory not found: $transcripts_dir"
  # -p keeps reruns from failing when the directory already exists.
  mkdir -p "$data_dir/json_paths"
  for entry in "$transcripts_dir"/*; do
    # An unmatched glob stays literal; skip it instead of handing it to find.
    [[ -e "$entry" ]] || continue
    name=${entry##*/}
    # Match on the file name only, so a directory whose name merely contains
    # "json" is not listed as a transcript.
    find "$entry" ! -type d -name '*.json' > "$data_dir/json_paths/$name"
  done
fi

if (( stage <= 2 )); then
  mkdir -p "$data_dir/xml_dir"
  job_pids=()
  # NOTE: assumes stage 1 produced path lists named 0..7 (i.e. the
  # transcripts directory holds sub-directories 0..7) - confirm for new data.
  for i in {0..7}; do
    [[ -f "$data_dir/json_paths/$i" ]] || die "missing path list: $data_dir/json_paths/$i"
    mkdir -p "$data_dir/xml_dir/$i"
    # --wait makes sbatch return only when the job ends; running it in the
    # background keeps the eight conversions parallel while still letting the
    # script hold stage 3 back until their output exists.
    sbatch --wait --wrap="python3 scripts/json_to_doc.py $data_dir/json_paths/$i $data_dir/xml_dir/$i" &
    job_pids+=("$!")
  done
  failed=0
  for pid in "${job_pids[@]}"; do
    wait "$pid" || failed=1
  done
  (( failed == 0 )) || die "at least one json_to_doc job failed; not building the index"
fi

if (( stage <= 3 )); then
  shards=("$data_dir"/xml_dir/*)
  [[ -e "${shards[0]}" ]] || die "no converted output found under $data_dir/xml_dir"
  find "${shards[@]}" ! -type d -name '*.xml' > "$data_dir/xml_file_path"
  sbatch --wrap="python3 scripts/create_index.py $data_dir/xml_file_path $data_dir/index"
fi
|