Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Håvard Vika Røen
PubMedToCsv
Commits
c0758ab8
Commit
c0758ab8
authored
Nov 25, 2020
by
Håvard Vika Røen
Browse files
Initial
parent
6343b5ee
Changes
5
Hide whitespace changes
Inline
Side-by-side
.gitignore
View file @
c0758ab8
...
...
@@ -3,3 +3,5 @@
################################################################################
/PubMedToCsv/bin/Debug/netcoreapp3.1
/PubMedToCsv/obj
*.suo
PubMedToCsv.sln
0 → 100644
View file @
c0758ab8
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.30621.155
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PubMedToCsv", "PubMedToCsv\PubMedToCsv.csproj", "{69DB8E3C-19CE-404E-8642-E9DC91A64297}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{69DB8E3C-19CE-404E-8642-E9DC91A64297}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{69DB8E3C-19CE-404E-8642-E9DC91A64297}.Debug|Any CPU.Build.0 = Debug|Any CPU
{69DB8E3C-19CE-404E-8642-E9DC91A64297}.Release|Any CPU.ActiveCfg = Release|Any CPU
{69DB8E3C-19CE-404E-8642-E9DC91A64297}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {7648CFD8-9C2E-420C-8A0F-67B7A60F2FC9}
EndGlobalSection
EndGlobal
PubMedToCsv/Program.cs
0 → 100644
View file @
c0758ab8
using
System
;
using
System.Collections.Generic
;
using
System.Globalization
;
using
System.IO
;
using
System.Linq
;
using
System.Text
;
using
System.Threading.Tasks
;
using
CommandLine
;
using
CsvHelper
;
using
HtmlAgilityPack
;
using
ScrapySharp.Extensions
;
using
ScrapySharp.Html
;
using
ScrapySharp.Html.Forms
;
using
ScrapySharp.Network
;
namespace
PubMedToCsv
{
class
Program
{
/// <summary>
/// Command-line options for the exporter, bound through the <c>Option</c> attributes.
/// </summary>
public class Options
{
    /// <summary>
    /// Search term to submit to PubMed (<c>-s</c> / <c>--search</c>, required).
    /// </summary>
    [Option('s', "search", Required = true, HelpText = "Search pubmed")]
    public string Search { get; set; }

    /// <summary>
    /// Path of the output file (<c>-f</c> / <c>--filename</c>, optional).
    /// Defaults to <c>PubMedExport_yyyyMMdd.csv</c> built from the local date.
    /// </summary>
    /// <remarks>
    /// The date is formatted with the invariant culture so the default name always uses
    /// the Gregorian calendar, regardless of the culture the tool runs under.
    /// </remarks>
    [Option('f', "filename", Required = false, HelpText = "Output filename")]
    public string Filename { get; set; } =
        "PubMedExport_" + DateTime.Now.ToString("yyyyMMdd", CultureInfo.InvariantCulture) + ".csv";
}
/// <summary>
/// Program entry point: parses <paramref name="args"/> into <see cref="Options"/> and,
/// when parsing succeeds, runs <see cref="DoWork"/> with the parsed options.
/// </summary>
/// <param name="args">Raw command-line arguments.</param>
public static async Task Main(string[] args)
{
    ParserResult<Options> parsed = Parser.Default.ParseArguments<Options>(args);

    // Both branches must yield a task: the success branch performs the export,
    // the error branch yields an already-completed task so nothing else runs.
    Task outcome = parsed.MapResult(
        (Options options) => DoWork(options),
        parseErrors => Task.FromResult(0));

    await outcome;
}
/// <summary>
/// Runs one export: submits the search on the PubMed home page, walks every result
/// page, and writes all collected entries to <c>o.Filename</c>.
/// </summary>
/// <param name="o">Parsed options; <c>Search</c> is the query and <c>Filename</c> the output path.</param>
/// <exception cref="InvalidOperationException">
/// The search form or the result-count element could not be found on the page.
/// </exception>
/// <remarks>
/// The browser and form calls used here are synchronous, so the returned task is
/// already complete when this method returns.
/// </remarks>
private static async Task DoWork(Options o)
{
    const int pageSize = 200;

    ScrapingBrowser browser = new ScrapingBrowser();
    //set UseDefaultCookiesParser as false if a website returns invalid cookies format
    //browser.UseDefaultCookiesParser = false;
    WebPage homePage = browser.NavigateToPage(new Uri("https://pubmed.ncbi.nlm.nih.gov"));

    PageWebForm form = homePage.FindFormById("search-form");
    if (form == null)
    {
        throw new InvalidOperationException("Search form 'search-form' was not found on the PubMed home page.");
    }

    form["term"] = o.Search; // e.g. "DNA damage, nano*, high througput"
    form["size"] = pageSize.ToString(CultureInfo.InvariantCulture);
    form.Method = HttpVerb.Get;
    WebPage resultsPage = form.Submit();

    HtmlNode resultCount = resultsPage.Html.CssSelect("div.results-amount span.value").FirstOrDefault();
    if (resultCount == null)
    {
        throw new InvalidOperationException(
            "Could not find the result count on the results page for search '" + o.Search + "'.");
    }

    // The count is rendered with thousands separators (e.g. "12,345").
    int count = Convert.ToInt32(resultCount.InnerText.Replace(",", ""), CultureInfo.InvariantCulture);

    int pagecount = 1;
    if (count > pageSize)
    {
        // Integer ceiling of count / pageSize; count > pageSize > 0 here.
        pagecount = (count - 1) / pageSize + 1;
        Console.WriteLine("parsing " + count + " results in " + pagecount + " pages");
    }

    List<PubMedResult> pubmedresults = new List<PubMedResult>();
    pubmedresults.AddRange(ParsePubmedResults(resultsPage));

    // The first page is already parsed; request the remaining ones by page number.
    for (int i = 2; i <= pagecount; i++)
    {
        Console.Write("\rparsing page " + i + "/" + pagecount);
        form["page"] = i.ToString(CultureInfo.InvariantCulture);
        resultsPage = form.Submit();
        pubmedresults.AddRange(ParsePubmedResults(resultsPage));
    }

    if (pagecount > 1)
    {
        // Terminate the "\r" progress line so the next message starts on its own line.
        Console.WriteLine();
    }

    CsvHandler.SaveCsv(pubmedresults, o.Filename);
    Console.WriteLine(o.Filename + " saved");
}
/// <summary>
/// Extracts one <see cref="PubMedResult"/> per result entry found on a results page.
/// </summary>
/// <param name="resultsPage">A results page returned by submitting the search form.</param>
/// <returns>
/// The parsed entries in page order; empty when the page contains no matching entries.
/// </returns>
/// <remarks>
/// <c>Doi</c> holds the journal citation text from the first "doi:" marker onward.
/// When the citation has no such marker, the whole citation text is kept.
/// Entries without an authors or citation element get an empty string for that field.
/// </remarks>
private static List<PubMedResult> ParsePubmedResults(WebPage resultsPage)
{
    // Trimmed inner text of the first node matching the selector, or "" when none matches.
    static string TextOrEmpty(HtmlNode scope, string selector)
    {
        return scope.CssSelect(selector).FirstOrDefault()?.InnerText.Trim() ?? string.Empty;
    }

    List<PubMedResult> pubmedresults = new List<PubMedResult>();

    foreach (HtmlNode divContent in
        resultsPage.Html.CssSelect("div.search-results-chunk.results-chunk div.docsum-content"))
    {
        var lnk = divContent.ChildNodes.FindFirst("a");

        PubMedResult res = new PubMedResult();
        // The href is appended to the site root to form the article URL.
        res.Url = "https://pubmed.ncbi.nlm.nih.gov" + lnk.Attributes.First(a => a.Name == "href").Value;
        res.Title = lnk.InnerText.Trim();
        res.Authors = TextOrEmpty(divContent, "span.docsum-authors.full-authors");

        string citation = TextOrEmpty(divContent, "span.docsum-journal-citation.full-journal-citation");
        int doiStart = citation.IndexOf("doi:", StringComparison.Ordinal);
        res.Doi = doiStart >= 0 ? citation.Substring(doiStart) : citation;

        pubmedresults.Add(res);
    }

    return pubmedresults;
}
}
/// <summary>
/// One search result entry as collected from a PubMed results page.
/// </summary>
/// <remarks>
/// NOTE(review): instances are written out by the CSV writer; the column order
/// presumably follows the property declaration order, so keep it stable.
/// </remarks>
public class PubMedResult
{
    /// <summary>Text of the result's title link.</summary>
    public string Title { get; set; }

    /// <summary>Absolute URL of the article page.</summary>
    public string Url { get; set; }

    /// <summary>Author list text shown for the result.</summary>
    public string Authors { get; set; }

    /// <summary>Citation text starting at the "doi:" marker, when one is present.</summary>
    public string Doi { get; set; }
}
/// <summary>
/// Writes collected results to disk as delimited text.
/// </summary>
public static class CsvHandler
{
    /// <summary>
    /// Writes <paramref name="records"/> to the file at <paramref name="path"/> using a
    /// tab character as the field delimiter and the invariant culture for formatting.
    /// </summary>
    /// <param name="records">Results to write.</param>
    /// <param name="path">Destination file path.</param>
    public static void SaveCsv(IEnumerable<PubMedResult> records, string path)
    {
        using (var output = new StreamWriter(path))
        using (var csvOut = new CsvWriter(output, CultureInfo.InvariantCulture))
        {
            // Tab-separated despite the conventional ".csv" extension.
            csvOut.Configuration.Delimiter = "\t";
            csvOut.WriteRecords(records);
        }
    }
}
}
\ No newline at end of file
PubMedToCsv/Properties/launchSettings.json
0 → 100644
View file @
c0758ab8
{
"profiles"
:
{
"PubMedToCsv"
:
{
"commandName"
:
"Project"
,
"commandLineArgs"
:
"--search
\"
covid
\"
"
}
}
}
\ No newline at end of file
PubMedToCsv/PubMedToCsv.csproj
0 → 100644
View file @
c0758ab8
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp3.1</TargetFramework>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="CommandLineParser" Version="2.8.0" />
<PackageReference Include="CsvHelper" Version="16.1.0" />
<PackageReference Include="ScrapySharp" Version="3.0.0" />
</ItemGroup>
</Project>
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment