002 Stata数据导入与导出

平时学习中我们经常会遇到各种格式的数据,如 .dta、.txt、.csv、.xls、.xlsx 等。不同格式的数据需要使用不同的命令进行导入和导出:

  • 直接输入:input
  • .dta:sysuse、use 、webuse、save
  • .txt :import delimited、export delimited、infile、outfile、infix、fileread()、insheet、outsheet
  • .csv:import delimited、export delimited、insheet、outsheet
  • .xls与.xlsx:import excel、export excel


* Type a small dataset in by hand.
* String variables need an explicit storage type (str20, str6) that fixes
* their maximum length; age has no type prefix and is read as numeric.
clear    // start from an empty dataset so the rows are not added to whatever is in memory
input str20 name age str6 sex
"A.Doyle" 22 male
"Mary Hope" 37 female
"Guy Fawkes" 48 male
end
* -end- is required: it closes the inline data block so that the commands
* that follow are executed instead of being read as more data rows.




* Load the auto example dataset and store a copy in the working directory.
sysuse auto, clear
save "myauto.dta", replace

* Reopen the copy, overwrite mpg in the first observation, and save it again.
use "myauto.dta", clear
replace mpg = 1 if _n == 1
save "myauto.dta", replace

* Fetch an example dataset from the Stata website and keep a local copy.
webuse lifeexp, clear
save "lifeexp.dta", replace


* Read a CSV file without supplying variable names.
insheet using "score.csv", clear

* Same file, naming each variable explicitly.
insheet name age test1 test2 using "score.csv", clear

* Tab-separated text file.
insheet using "score.txt", clear

* Space-separated text file: read as-is first; it still needs converting.
insheet using "score1.txt", clear

* delimiter(" ") declares the space character as the field separator.
insheet name age test1 test2 using "score1.txt", delimiter(" ") clear

insheet reads into memory from a disk a dataset that is not in Stata format. insheet is intended for reading files created by a spreadsheet or
database program. Regardless of the creator of the file, insheet reads text (ASCII) files in which there is 1 observation per line and the values
are separated by tabs or commas.
Also the first line of the file can contain the variable names.

* Write the data in memory to a text file, then open the file to inspect it.
outsheet using "score2.txt", replace
shellout score2.txt

* noquote: drop the quotation marks around string values.
outsheet using "score2.txt", replace noquote
shellout score2.txt

* noname: additionally leave out the variable names.
outsheet using "score2.txt", replace noquote noname
shellout score2.txt

shellout -- Opens documents and their programs from inside Stata.
shellout 随 outreg2 程序包一同发布,可以通过 ssc install outreg2 安装。

* 3. import delimited
* rowrange() and colrange() restrict which rows and columns are read.
import delimited name age test1 using "score.txt", clear rowrange(2:4) colrange(1:3)

* Open-ended ranges: from row 2 to the last row, from the first column to column 3.
import delimited name age test1 using "score.txt", clear rowrange(2) colrange(:3)

* Download a GB-encoded web page and convert the saved file to UTF-8.
* Note 1: the data in memory must be cleared before transcoding.
clear
copy "http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_CorpManager/stockid/600018.phtml" temp.txt, replace

* The page is gb2312; gb18030 is an extension of gb2312.
unicode encoding set gb18030

* Convert the file to UTF-8.
* Note 2: the file to be converted must be given without a directory path.
unicode translate temp.txt, transutf8

* Delete the backup files to avoid later conflicts.
* Use with caution, especially for .dta files.
unicode erasebackups, badidea

* Alternative to -unicode translate-: let -import delimited- transcode the
* file while reading it.
* temp.txt was rewritten in place as UTF-8 by the -unicode translate- step
* earlier in this file, so download the original GB-encoded page again first.
* Otherwise encoding("gb18030") is applied to UTF-8 bytes and the Chinese
* text comes out garbled.
copy "http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_CorpManager/stockid/600018.phtml" temp.txt, replace

import delimited using temp.txt, encoding("gb18030") clear

* delimiter("...", asstring) treats the whole string as one separator instead
* of treating each of its characters as a separator.
import delimited using temp.txt, encoding("gb18030") delimiter("b1.ak;UI", asstring) clear

与 insheet 相比,import delimited 还可以在读入时指定文件编码(实现转码),并且支持更灵活的分隔符设置。

* 4. export delimited
sysuse auto, clear

* Works much like outsheet, but string variables are written without
* quotation marks by default.
export delimited using "auto.txt", replace

* quote: put the quotation marks back around string values.
export delimited using "auto.txt", replace quote
shellout auto.txt
* Read a fixed-width text file: each variable is taken from the listed columns.
* Fixed-length strings go up to str2045; anything longer has to be a strL.
infix str5 name 1-5 age 7-8 test1 9-10 test2 11-12 using "score3.txt", clear

* Read every line of a downloaded web page into a single strL variable.
copy "http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_CorpManager/stockid/600018.phtml" temp.txt, replace
infix strL v 1-100000 using "temp.txt", clear

* Convert the text from gb18030 to UTF-8.
* NOTE(review): mode 1 presumably substitutes invalid byte sequences with the
* Unicode replacement character - confirm against the ustrfrom() help.
replace v = ustrfrom(v, "gb18030", 1)



* Write a three-line text file with the -file- commands, then open it.
tempname fh
file open `fh' using "吃瓜俱乐部.txt", write replace
file write `fh' "范冰冰和李晨离婚了" _n
file write `fh' "宋慧乔和宋仲基离婚了" _n
file write `fh' "将吃瓜进行到底"
file close `fh'
shellout 吃瓜俱乐部.txt

* Three ways of reading the text file written above.
insheet using "吃瓜俱乐部.txt", clear
import delimited using "吃瓜俱乐部.txt", clear encoding("utf-8")
infix str50 v 1-30 using "吃瓜俱乐部.txt", clear

需要注意:使用 import delimited 读入该文件时,如果不添加 encoding("utf-8") 选项,中文内容会出现乱码。

encoding("encoding") specifies the encoding of the text file to be read. The default is encoding("latin1"). Specify encoding("utf-8") for files to
be encoded in UTF-8.

* fileread() returns the whole content of a text file as one string, so the
* entire file ends up in a single cell.
* -clear- comes first: the preceding infix example leaves observations and a
* variable named v in memory, and -set obs- cannot lower the observation count.
clear
set obs 1
gen v = fileread("吃瓜俱乐部.txt")

* With two observations, both rows receive the same full-file string.
* -clear- again, because v already exists from the example above.
clear
set obs 2
gen v = fileread("吃瓜俱乐部.txt")



* firstrow takes the variable names from the first row of the sheet;
* case(lower) shows them in lower case.
import excel using "利润表.xls", firstrow case(lower) clear

* Pick a specific worksheet by name.
import excel using "利润表.xls", sheet("2009") firstrow case(lower) clear
import excel using "利润表.xls", sheet("2010") firstrow case(lower) clear

* List every worksheet in the workbook; the results are left in r().
import excel using 利润表.xls, describe
return list

* Copy the r() results into locals before doing any work: commands run inside
* the loops below may overwrite r().
local n_sheets = r(N_worksheet)
forvalues i = 1/`n_sheets' {
    local sheet_`i' "`r(worksheet_`i')'"
}

* Import each worksheet and save it as its own .dta file.
forvalues i = 1/`n_sheets' {
    display "this is `sheet_`i''"
    import excel using 利润表.xls, first case(lower) sheet("`sheet_`i''") clear
    save 利润表_`i', replace
}

* Stack the per-sheet files into one dataset.
* -clear- first: the last imported sheet is still in memory and would
* otherwise be appended a second time.
clear
forvalues i = 1/`n_sheets' {
    append using 利润表_`i'
}
save 利润表.dta, replace
* Inspect the workbook, then read only part of a sheet with cellrange().
import excel using "利润表1.xls", describe

* Read cells A3 through F12.
import excel using "利润表1.xls", clear cellrange(A3:F12)

* Read from A2 to the end of the sheet, taking variable names from the first row read.
import excel using "利润表1.xls", clear cellrange(A2) firstrow

* The units row was not removed when the file was downloaded: read the sheet,
* drop the first row, then convert the string variables to numeric.
import excel using "利润表2.xls", clear cellrange(A2) firstrow
drop if _n == 1
destring _all, replace
sysuse auto, clear

* Plain export.
export excel using "auto.xlsx", replace

* Put the variable names in the first row.
export excel using "auto1.xlsx", firstrow(variables) replace

* Put the variable labels in the first row.
export excel using "auto2.xlsx", firstrow(varlabels) replace

* Write into the sheet "newsheet" of auto.xlsx, starting at cell B2.
export excel using "auto.xlsx", sheet("newsheet", replace) cell(B2) firstrow(variables)



