그 외

[그 외] ETRI MRC JSON 파일을 pandas DataFrame으로 변환하는 코드

Dong's Universe 2023. 6. 18. 14:17
# Flatten the ETRI MRC JSON file into a pandas DataFrame with one row per
# question. The nested layout read below is:
#   jf['data'][i]['title']
#   jf['data'][i]['paragraphs'][0]['context']
#   jf['data'][i]['paragraphs'][0]['qas'][j]['question' | 'id' | 'answers']
import json

import pandas as pd

# Explicit UTF-8: without it the platform's locale encoding is used, which
# fails or garbles the Korean text on systems whose default is not UTF-8.
with open("../data/20181101_ETRI_MRC_v1.json", 'r', encoding='utf-8') as f:
    jf = json.load(f)

# One parallel list per output column; each gains one entry per question.
titles = []
contexts = []
questions = []
ids = []
answers = []
document_ids = []
index_level_0s = []

# Number of documents in the file (not the number of resulting rows).
print(len(jf['data']))
for data in jf['data']:
    title = data['title']
    # NOTE(review): only the first paragraph of each document is read; any
    # further paragraphs are ignored -- confirm this matches the dataset.
    paragraph = data['paragraphs'][0]
    context = paragraph['context']
    # Both columns are filled with a constant 0 -- presumably placeholders so
    # the frame matches a schema expected elsewhere; verify against consumers.
    document_id = 0
    index_level_0 = 0

    # A document whose 'qas' list is empty simply contributes no rows.
    for qa in paragraph['qas']:
        # Only the first listed answer is kept, wrapped in single-element
        # lists under the 'answer_start' / 'text' keys.
        first_answer = qa['answers'][0]
        answer = {
            'answer_start': [first_answer['answer_start']],
            'text': [first_answer['text']],
        }
        titles.append(title)
        contexts.append(context)
        questions.append(qa['question'])
        ids.append(qa['id'])
        answers.append(answer)
        document_ids.append(document_id)
        index_level_0s.append(index_level_0)


df = pd.DataFrame({
    'title': titles,
    'context': contexts,
    'question': questions,
    'id': ids,
    'answers': answers,
    'document_id': document_ids,
    '__index_level_0__': index_level_0s,
})

# Bare expression: displays the frame when run as the last line of a notebook
# cell; it has no effect when run as a plain script.
df